HEAD
An insightful journey into understanding the patterns behind road fatalities in the U.S. and creating actionable insights for a safer future.
Motor vehicle accidents are a leading cause of unintentional injury-related deaths in the U.S. Using the 2022 FARS dataset, our analysis focuses on revealing trends and risk factors that contribute to fatal crashes.
Our first step is getting ready by loading the necessary packages and the data.
# Loading the necessary libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(PMCMRplus)
library(here)
## here() starts at /Users/chengyuhan/Documents/GitHub/DATS_6101_TEAM5
library(shiny)
# Loading the dataset (2022 FARS accident-level file, one row per fatal crash)
# NOTE(review): this absolute path is machine-specific and will break for other
# users. Since here() is already attached (see library(here) above), prefer
# read_csv(here("<relative path to the csv>")) — confirm where the file lives
# inside the project before changing.
dataset <- read_csv("/Users/chengyuhan/Desktop/NTAD_Fatality_Analysis_Reporting_System_2022_Accidents.csv")
## Rows: 39480 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (34): STATENAME, COUNTYNAME, CITYNAME, MONTHNAME, DAY_WEEKNAME, HOURNAME...
## dbl (49): OBJECTID, STATE, ST_CASE, PEDS, PERNOTMVIT, VE_TOTAL, VE_FORMS, PV...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Now viewing the structure of the dataset: column names, types, and the
# first few values of each of the 83 columns
str(dataset)
## spc_tbl_ [39,480 × 83] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ OBJECTID : num [1:39480] 1 2 3 4 5 6 7 8 9 10 ...
## $ STATE : num [1:39480] 1 1 1 1 1 1 1 1 1 1 ...
## $ STATENAME : chr [1:39480] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ ST_CASE : num [1:39480] 10001 10002 10003 10004 10005 ...
## $ PEDS : num [1:39480] 0 0 0 0 1 1 0 0 0 0 ...
## $ PERNOTMVIT : num [1:39480] 0 0 0 0 1 1 0 0 0 0 ...
## $ VE_TOTAL : num [1:39480] 2 2 1 1 1 1 2 1 2 1 ...
## $ VE_FORMS : num [1:39480] 2 2 1 1 1 1 2 1 2 1 ...
## $ PVH_INVL : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ PERSONS : num [1:39480] 3 5 2 1 1 5 1 1 3 1 ...
## $ PERMVIT : num [1:39480] 3 5 2 1 1 5 1 1 3 1 ...
## $ COUNTY : num [1:39480] 107 101 115 101 73 101 63 101 71 131 ...
## $ COUNTYNAME : chr [1:39480] "PICKENS (107)" "MONTGOMERY (101)" "ST. CLAIR (115)" "MONTGOMERY (101)" ...
## $ CITY : num [1:39480] 0 0 0 0 0 2130 0 0 0 0 ...
## $ CITYNAME : chr [1:39480] "NOT APPLICABLE" "NOT APPLICABLE" "NOT APPLICABLE" "NOT APPLICABLE" ...
## $ MONTH : num [1:39480] 1 1 1 1 1 1 1 1 1 1 ...
## $ MONTHNAME : chr [1:39480] "January" "January" "January" "January" ...
## $ DAY : num [1:39480] 1 1 1 2 2 2 4 4 4 5 ...
## $ DAYNAME : num [1:39480] 1 1 1 2 2 2 4 4 4 5 ...
## $ DAY_WEEK : num [1:39480] 7 7 7 1 1 1 3 3 3 4 ...
## $ DAY_WEEKNAME: chr [1:39480] "Saturday" "Saturday" "Saturday" "Sunday" ...
## $ YEAR : num [1:39480] 2022 2022 2022 2022 2022 ...
## $ HOUR : num [1:39480] 12 16 1 14 18 18 9 14 11 0 ...
## $ HOURNAME : chr [1:39480] "12:00pm-12:59pm" "4:00pm-4:59pm" "1:00am-1:59am" "2:00pm-2:59pm" ...
## $ MINUTE : num [1:39480] 30 40 33 46 48 28 5 50 40 0 ...
## $ MINUTENAME : chr [1:39480] "30" "40" "33" "46" ...
## $ TWAY_ID : chr [1:39480] "US-82 SR-6" "US-231 SR-53" "CR-KELLY CREEK RD" "I-65" ...
## $ TWAY_ID2 : chr [1:39480] NA NA NA NA ...
## $ ROUTE : num [1:39480] 2 2 4 1 1 6 4 4 2 3 ...
## $ ROUTENAME : chr [1:39480] "US Highway" "US Highway" "County Road" "Interstate" ...
## $ RUR_URB : num [1:39480] 1 1 1 1 2 2 1 1 1 1 ...
## $ RUR_URBNAME : chr [1:39480] "Rural" "Rural" "Rural" "Rural" ...
## $ FUNC_SYS : num [1:39480] 3 3 5 1 1 4 5 5 3 3 ...
## $ FUNC_SYSNAME: chr [1:39480] "Principal Arterial - Other" "Principal Arterial - Other" "Major Collector" "Interstate" ...
## $ RD_OWNER : num [1:39480] 1 1 2 1 1 4 2 2 1 1 ...
## $ RD_OWNERNAME: chr [1:39480] "State Highway Agency" "State Highway Agency" "County Highway Agency" "State Highway Agency" ...
## $ NHS : num [1:39480] 1 1 0 1 1 0 0 0 1 1 ...
## $ NHSNAME : chr [1:39480] "This section IS ON the NHS" "This section IS ON the NHS" "This section IS NOT on the NHS" "This section IS ON the NHS" ...
## $ SP_JUR : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ SP_JURNAME : chr [1:39480] "No Special Jurisdiction" "No Special Jurisdiction" "No Special Jurisdiction" "No Special Jurisdiction" ...
## $ MILEPT : num [1:39480] 4 974 0 1595 1342 ...
## $ MILEPTNAME : chr [1:39480] "4" "974" "None" "1595" ...
## $ LATITUDE : num [1:39480] 33.5 32.1 33.4 32.2 33.5 ...
## $ LATITUDENAME: num [1:39480] 33.5 32.1 33.4 32.2 33.5 ...
## $ LONGITUD : num [1:39480] -88.3 -86.1 -86.4 -86.4 -86.7 ...
## $ LONGITUDNAME: num [1:39480] -88.3 -86.1 -86.4 -86.4 -86.7 ...
## $ HARM_EV : num [1:39480] 12 12 42 34 8 8 12 38 12 42 ...
## $ HARM_EVNAME : chr [1:39480] "Motor Vehicle In-Transport" "Motor Vehicle In-Transport" "Tree (Standing Only)" "Ditch" ...
## $ MAN_COLL : num [1:39480] 7 2 0 0 0 0 1 0 6 0 ...
## $ MAN_COLLNAME: chr [1:39480] "Sideswipe - Same Direction" "Front-to-Front" "The First Harmful Event was Not a Collision with a Motor Vehicle in Transport" "The First Harmful Event was Not a Collision with a Motor Vehicle in Transport" ...
## $ RELJCT1 : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ RELJCT1NAME : chr [1:39480] "No" "No" "No" "No" ...
## $ RELJCT2 : num [1:39480] 1 1 1 1 1 1 1 1 2 1 ...
## $ RELJCT2NAME : chr [1:39480] "Non-Junction" "Non-Junction" "Non-Junction" "Non-Junction" ...
## $ TYP_INT : num [1:39480] 1 1 1 1 1 1 1 1 2 1 ...
## $ TYP_INTNAME : chr [1:39480] "Not an Intersection" "Not an Intersection" "Not an Intersection" "Not an Intersection" ...
## $ REL_ROAD : num [1:39480] 1 1 4 4 2 1 1 4 1 4 ...
## $ REL_ROADNAME: chr [1:39480] "On Roadway" "On Roadway" "On Roadside" "On Roadside" ...
## $ WRK_ZONE : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ WRK_ZONENAME: chr [1:39480] "None" "None" "None" "None" ...
## $ LGT_COND : num [1:39480] 1 1 2 1 2 3 1 1 1 2 ...
## $ LGT_CONDNAME: chr [1:39480] "Daylight" "Daylight" "Dark - Not Lighted" "Daylight" ...
## $ WEATHER : num [1:39480] 1 1 10 10 2 1 1 1 1 1 ...
## $ WEATHERNAME : chr [1:39480] "Clear" "Clear" "Cloudy" "Cloudy" ...
## $ SCH_BUS : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ SCH_BUSNAME : chr [1:39480] "No" "No" "No" "No" ...
## $ RAIL : chr [1:39480] "0000000" "0000000" "0000000" "0000000" ...
## $ RAILNAME : chr [1:39480] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ NOT_HOUR : num [1:39480] 12 99 1 14 18 18 99 99 11 0 ...
## $ NOT_HOURNAME: chr [1:39480] "12:00pm-12:59pm" "Unknown" "1:00am-1:59am" "2:00pm-2:59pm" ...
## $ NOT_MIN : num [1:39480] 47 99 33 48 48 26 99 99 36 0 ...
## $ NOT_MINNAME : chr [1:39480] "47" "Unknown" "33" "48" ...
## $ ARR_HOUR : num [1:39480] 13 99 1 15 18 18 99 99 11 0 ...
## $ ARR_HOURNAME: chr [1:39480] "1:00pm-1:59pm" "Unknown EMS Scene Arrival Hour" "1:00am-1:59am" "3:00pm-3:59pm" ...
## $ ARR_MIN : num [1:39480] 4 99 50 9 54 32 99 99 54 33 ...
## $ ARR_MINNAME : chr [1:39480] "4" "Unknown EMS Scene Arrival Minutes" "50" "9" ...
## $ HOSP_HR : num [1:39480] 13 99 99 15 88 99 88 99 12 88 ...
## $ HOSP_HRNAME : chr [1:39480] "1:00pm-1:59pm" "Unknown" "Unknown" "3:00pm-3:59pm" ...
## $ HOSP_MN : num [1:39480] 47 99 99 44 88 99 88 99 41 88 ...
## $ HOSP_MNNAME : chr [1:39480] "47" "Unknown EMS Hospital Arrival Time" "Unknown EMS Hospital Arrival Time" "44" ...
## $ FATALS : num [1:39480] 1 2 1 1 1 1 1 1 1 1 ...
## $ x : num [1:39480] -88.3 -86.1 -86.4 -86.4 -86.7 ...
## $ y : num [1:39480] 33.5 32.1 33.4 32.2 33.5 ...
## - attr(*, "spec")=
## .. cols(
## .. OBJECTID = col_double(),
## .. STATE = col_double(),
## .. STATENAME = col_character(),
## .. ST_CASE = col_double(),
## .. PEDS = col_double(),
## .. PERNOTMVIT = col_double(),
## .. VE_TOTAL = col_double(),
## .. VE_FORMS = col_double(),
## .. PVH_INVL = col_double(),
## .. PERSONS = col_double(),
## .. PERMVIT = col_double(),
## .. COUNTY = col_double(),
## .. COUNTYNAME = col_character(),
## .. CITY = col_double(),
## .. CITYNAME = col_character(),
## .. MONTH = col_double(),
## .. MONTHNAME = col_character(),
## .. DAY = col_double(),
## .. DAYNAME = col_double(),
## .. DAY_WEEK = col_double(),
## .. DAY_WEEKNAME = col_character(),
## .. YEAR = col_double(),
## .. HOUR = col_double(),
## .. HOURNAME = col_character(),
## .. MINUTE = col_double(),
## .. MINUTENAME = col_character(),
## .. TWAY_ID = col_character(),
## .. TWAY_ID2 = col_character(),
## .. ROUTE = col_double(),
## .. ROUTENAME = col_character(),
## .. RUR_URB = col_double(),
## .. RUR_URBNAME = col_character(),
## .. FUNC_SYS = col_double(),
## .. FUNC_SYSNAME = col_character(),
## .. RD_OWNER = col_double(),
## .. RD_OWNERNAME = col_character(),
## .. NHS = col_double(),
## .. NHSNAME = col_character(),
## .. SP_JUR = col_double(),
## .. SP_JURNAME = col_character(),
## .. MILEPT = col_double(),
## .. MILEPTNAME = col_character(),
## .. LATITUDE = col_double(),
## .. LATITUDENAME = col_double(),
## .. LONGITUD = col_double(),
## .. LONGITUDNAME = col_double(),
## .. HARM_EV = col_double(),
## .. HARM_EVNAME = col_character(),
## .. MAN_COLL = col_double(),
## .. MAN_COLLNAME = col_character(),
## .. RELJCT1 = col_double(),
## .. RELJCT1NAME = col_character(),
## .. RELJCT2 = col_double(),
## .. RELJCT2NAME = col_character(),
## .. TYP_INT = col_double(),
## .. TYP_INTNAME = col_character(),
## .. REL_ROAD = col_double(),
## .. REL_ROADNAME = col_character(),
## .. WRK_ZONE = col_double(),
## .. WRK_ZONENAME = col_character(),
## .. LGT_COND = col_double(),
## .. LGT_CONDNAME = col_character(),
## .. WEATHER = col_double(),
## .. WEATHERNAME = col_character(),
## .. SCH_BUS = col_double(),
## .. SCH_BUSNAME = col_character(),
## .. RAIL = col_character(),
## .. RAILNAME = col_character(),
## .. NOT_HOUR = col_double(),
## .. NOT_HOURNAME = col_character(),
## .. NOT_MIN = col_double(),
## .. NOT_MINNAME = col_character(),
## .. ARR_HOUR = col_double(),
## .. ARR_HOURNAME = col_character(),
## .. ARR_MIN = col_double(),
## .. ARR_MINNAME = col_character(),
## .. HOSP_HR = col_double(),
## .. HOSP_HRNAME = col_character(),
## .. HOSP_MN = col_double(),
## .. HOSP_MNNAME = col_character(),
## .. FATALS = col_double(),
## .. x = col_double(),
## .. y = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: five-number summary + mean for numerics, NA counts
summary(dataset)
## OBJECTID STATE STATENAME ST_CASE
## Min. : 1 Min. : 1.0 Length:39480 Min. : 10001
## 1st Qu.: 9871 1st Qu.:12.0 Class :character 1st Qu.:121923
## Median :19740 Median :26.0 Mode :character Median :261050
## Mean :19740 Mean :27.2 Mean :272763
## 3rd Qu.:29610 3rd Qu.:42.0 3rd Qu.:420773
## Max. :39480 Max. :56.0 Max. :560118
##
## PEDS PERNOTMVIT VE_TOTAL VE_FORMS PVH_INVL
## Min. : 0.0 Min. : 0.0 Min. : 1.0 Min. : 1.0 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 1.0 1st Qu.: 0.00
## Median : 0.0 Median : 0.0 Median : 1.0 Median : 1.0 Median : 0.00
## Mean : 0.2 Mean : 0.3 Mean : 1.6 Mean : 1.5 Mean : 0.04
## 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 2.0 3rd Qu.: 2.0 3rd Qu.: 0.00
## Max. :73.0 Max. :73.0 Max. :50.0 Max. :50.0 Max. :10.00
##
## PERSONS PERMVIT COUNTY COUNTYNAME
## Min. : 0.0 Min. : 0.0 Min. : 0 Length:39480
## 1st Qu.: 1.0 1st Qu.: 1.0 1st Qu.: 31 Class :character
## Median : 2.0 Median : 2.0 Median : 69 Mode :character
## Mean : 2.2 Mean : 2.2 Mean : 92
## 3rd Qu.: 3.0 3rd Qu.: 3.0 3rd Qu.:115
## Max. :128.0 Max. :128.0 Max. :999
##
## CITY CITYNAME MONTH MONTHNAME
## Min. : 0 Length:39480 Min. : 1.0 Length:39480
## 1st Qu.: 0 Class :character 1st Qu.: 4.0 Class :character
## Median : 190 Mode :character Median : 7.0 Mode :character
## Mean :1308 Mean : 6.7
## 3rd Qu.:2010 3rd Qu.:10.0
## Max. :9999 Max. :12.0
##
## DAY DAYNAME DAY_WEEK DAY_WEEKNAME YEAR
## Min. : 1.0 Min. : 1.0 Min. :1.00 Length:39480 Min. :2022
## 1st Qu.: 8.0 1st Qu.: 8.0 1st Qu.:2.00 Class :character 1st Qu.:2022
## Median :16.0 Median :16.0 Median :4.00 Mode :character Median :2022
## Mean :15.7 Mean :15.7 Mean :4.13 Mean :2022
## 3rd Qu.:23.0 3rd Qu.:23.0 3rd Qu.:6.00 3rd Qu.:2022
## Max. :31.0 Max. :31.0 Max. :7.00 Max. :2022
##
## HOUR HOURNAME MINUTE MINUTENAME
## Min. : 0.0 Length:39480 Min. : 0.0 Length:39480
## 1st Qu.: 7.0 Class :character 1st Qu.:14.0 Class :character
## Median :14.0 Mode :character Median :30.0 Mode :character
## Mean :13.5 Mean :29.3
## 3rd Qu.:19.0 3rd Qu.:45.0
## Max. :99.0 Max. :99.0
##
## TWAY_ID TWAY_ID2 ROUTE ROUTENAME
## Length:39480 Length:39480 Min. :1.00 Length:39480
## Class :character Class :character 1st Qu.:2.00 Class :character
## Mode :character Mode :character Median :3.00 Mode :character
## Mean :3.87
## 3rd Qu.:6.00
## Max. :9.00
##
## RUR_URB RUR_URBNAME FUNC_SYS FUNC_SYSNAME
## Min. :1.00 Length:39480 Min. : 1.0 Length:39480
## 1st Qu.:1.00 Class :character 1st Qu.: 3.0 Class :character
## Median :2.00 Mode :character Median : 4.0 Mode :character
## Mean :1.63 Mean : 4.6
## 3rd Qu.:2.00 3rd Qu.: 5.0
## Max. :9.00 Max. :99.0
##
## RD_OWNER RD_OWNERNAME NHS NHSNAME
## Min. : 1.0 Length:39480 Min. :0.00 Length:39480
## 1st Qu.: 1.0 Class :character 1st Qu.:0.00 Class :character
## Median : 1.0 Mode :character Median :0.00 Mode :character
## Mean :13.5 Mean :0.48
## 3rd Qu.: 4.0 3rd Qu.:1.00
## Max. :99.0 Max. :9.00
##
## SP_JUR SP_JURNAME MILEPT MILEPTNAME
## Min. :0.00 Length:39480 Min. : 0 Length:39480
## 1st Qu.:0.00 Class :character 1st Qu.: 19 Class :character
## Median :0.00 Mode :character Median : 138 Mode :character
## Mean :0.03 Mean :27695
## 3rd Qu.:0.00 3rd Qu.:99998
## Max. :9.00 Max. :99999
##
## LATITUDE LATITUDENAME LONGITUD LONGITUDNAME HARM_EV
## Min. : 18.0 Min. : 18.0 Min. :-159 Min. :-159 Min. : 1.0
## 1st Qu.: 32.9 1st Qu.: 32.9 1st Qu.: -99 1st Qu.: -99 1st Qu.: 8.0
## Median : 36.1 Median : 36.1 Median : -88 Median : -88 Median :12.0
## Mean : 36.6 Mean : 36.6 Mean : -87 Mean : -87 Mean :17.9
## 3rd Qu.: 40.4 3rd Qu.: 40.4 3rd Qu.: -81 3rd Qu.: -81 3rd Qu.:26.0
## Max. :100.0 Max. :100.0 Max. :1000 Max. :1000 Max. :99.0
##
## HARM_EVNAME MAN_COLL MAN_COLLNAME RELJCT1
## Length:39480 Min. : 0 Length:39480 Min. :0.00
## Class :character 1st Qu.: 0 Class :character 1st Qu.:0.00
## Mode :character Median : 0 Mode :character Median :0.00
## Mean : 2 Mean :0.08
## 3rd Qu.: 2 3rd Qu.:0.00
## Max. :99 Max. :9.00
##
## RELJCT1NAME RELJCT2 RELJCT2NAME TYP_INT
## Length:39480 Min. : 1.0 Length:39480 Min. : 1.0
## Class :character 1st Qu.: 1.0 Class :character 1st Qu.: 1.0
## Mode :character Median : 1.0 Mode :character Median : 1.0
## Mean : 2.5 Mean : 1.8
## 3rd Qu.: 2.0 3rd Qu.: 2.0
## Max. :99.0 Max. :99.0
##
## TYP_INTNAME REL_ROAD REL_ROADNAME WRK_ZONE
## Length:39480 Min. : 1.0 Length:39480 Min. :0.00
## Class :character 1st Qu.: 1.0 Class :character 1st Qu.:0.00
## Mode :character Median : 1.0 Mode :character Median :0.00
## Mean : 2.4 Mean :0.04
## 3rd Qu.: 4.0 3rd Qu.:0.00
## Max. :99.0 Max. :4.00
##
## WRK_ZONENAME LGT_COND LGT_CONDNAME WEATHER
## Length:39480 Min. :1.00 Length:39480 Min. : 1
## Class :character 1st Qu.:1.00 Class :character 1st Qu.: 1
## Mode :character Median :2.00 Mode :character Median : 1
## Mean :1.99 Mean : 6
## 3rd Qu.:3.00 3rd Qu.: 1
## Max. :9.00 Max. :99
##
## WEATHERNAME SCH_BUS SCH_BUSNAME RAIL
## Length:39480 Min. :0.000 Length:39480 Length:39480
## Class :character 1st Qu.:0.000 Class :character Class :character
## Mode :character Median :0.000 Mode :character Mode :character
## Mean :0.003
## 3rd Qu.:0.000
## Max. :1.000
##
## RAILNAME NOT_HOUR NOT_HOURNAME NOT_MIN
## Length:39480 Min. : 0.0 Length:39480 Min. : 0.0
## Class :character 1st Qu.:16.0 Class :character 1st Qu.:35.0
## Mode :character Median :99.0 Mode :character Median :98.0
## Mean :61.4 Mean :69.8
## 3rd Qu.:99.0 3rd Qu.:99.0
## Max. :99.0 Max. :99.0
##
## NOT_MINNAME ARR_HOUR ARR_HOURNAME ARR_MIN
## Length:39480 Min. : 0.0 Length:39480 Min. : 0.0
## Class :character 1st Qu.:16.0 Class :character 1st Qu.:37.0
## Mode :character Median :99.0 Mode :character Median :98.0
## Mean :62.7 Mean :70.8
## 3rd Qu.:99.0 3rd Qu.:99.0
## Max. :99.0 Max. :99.0
##
## ARR_MINNAME HOSP_HR HOSP_HRNAME HOSP_MN
## Length:39480 Min. : 0.0 Length:39480 Min. : 0
## Class :character 1st Qu.:88.0 Class :character 1st Qu.:88
## Mode :character Median :88.0 Mode :character Median :88
## Mean :77.8 Mean :81
## 3rd Qu.:99.0 3rd Qu.:99
## Max. :99.0 Max. :99
##
## HOSP_MNNAME FATALS x y
## Length:39480 Min. :1.00 Min. :-159.5 Min. :18.0
## Class :character 1st Qu.:1.00 1st Qu.: -99.1 1st Qu.:32.9
## Mode :character Median :1.00 Median : -88.0 Median :36.1
## Mean :1.08 Mean : -92.7 Mean :36.3
## 3rd Qu.:1.00 3rd Qu.: -81.5 3rd Qu.:40.3
## Max. :9.00 Max. : -65.5 Max. :71.3
## NA's :232 NA's :232
# Remove exact duplicate rows, then report which columns contain missing values
dataset <- distinct(dataset)
na_counts <- colSums(is.na(dataset))
print(na_counts[na_counts > 0])
## TWAY_ID2 x y
## 29573 232 232
# Drop the three columns that contain missing values (TWAY_ID2, x, y)
dataset <- dataset %>%
  select(-all_of(c("TWAY_ID2", "x", "y")))
Dropping columns that duplicate information in other fields or are irrelevant to the analysis.
# Columns whose information is duplicated by a *NAME counterpart or that
# is not used in this analysis
cols_to_drop <- c(
  "STATE", "CITY", "COUNTY", "MONTH", "DAYNAME", "DAY_WEEK", "MINUTENAME",
  "ROUTE", "RUR_URB", "FUNC_SYS", "RD_OWNER", "NHS", "SP_JUR",
  "LATITUDENAME", "LONGITUDNAME", "MILEPT", "HARM_EV", "MAN_COLL",
  "MILEPTNAME", "RELJCT1", "RELJCT2", "TYP_INT", "REL_ROAD", "WRK_ZONE",
  "LGT_COND", "WEATHER", "SCH_BUS", "RAIL", "NOT_MIN", "ARR_MINNAME",
  "ARR_HOUR", "HOSP_MN", "SCH_BUSNAME", "RAILNAME", "PERNOTMVIT",
  "VE_FORMS", "PERSONS"
)
dataset <- dataset %>% select(-all_of(cols_to_drop))
# Normalise column names for consistency: lower-case, spaces -> underscores
colnames(dataset) <- str_replace_all(str_to_lower(colnames(dataset)), " ", "_")
colnames(dataset)
# Convert categorical character columns into factors; day of week gets an
# explicit Sunday-first ordering so tables and plots read chronologically
weekday_levels <- c("Sunday", "Monday", "Tuesday", "Wednesday",
                    "Thursday", "Friday", "Saturday")
dataset <- dataset %>%
  mutate(
    across(c(rur_urbname, func_sysname, lgt_condname,
             weathername, statename), factor),
    day_weekname = factor(day_weekname, levels = weekday_levels)
  )
str(dataset)
# Numeric columns to screen for outliers (reused by later chunks)
columns <- c("ve_total", "peds", "permvit", "arr_min", "hosp_hr")
# Long format — one (variable, value) row per cell — for faceted plotting
dataset_long <- dataset %>%
  select(all_of(columns)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")
# One boxplot per column; outliers drawn as red points
ggplot(dataset_long, aes(x = variable, y = value)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, outlier.size = 2) +
  facet_wrap(~ variable, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Boxplots of Selected Columns to Identify Outliers",
       x = "Variables",
       y = "Values")
# Remove rows containing outliers in the given columns, using Tukey's fences:
# any value below Q1 - 1.5*IQR or above Q3 + 1.5*IQR is first set to NA, and
# the affected rows are then dropped.
#
# df      : data frame to clean
# columns : character vector of numeric column names to screen
#
# Returns df with outlier rows removed.
# FIX: drop_na() is restricted to the screened columns — the original bare
# drop_na() would also silently drop rows with pre-existing NAs in ANY other
# column, which is not what outlier removal should do.
remove_outliers <- function(df, columns) {
  df %>%
    mutate(across(all_of(columns), ~ ifelse(
      . < (quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE)) |
        . > (quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)),
      NA, .))) %>%
    drop_na(all_of(columns))
}
dataset <- remove_outliers(dataset, columns)
# Re-check the cleaned columns: reshape to long form and redraw the boxplots
dataset_long <- dataset %>%
  select(all_of(columns)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")
# After removal, any remaining red points fall inside the new Tukey fences
ggplot(dataset_long, aes(x = variable, y = value)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, outlier.size = 2) +
  facet_wrap(~ variable, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Boxplots of Selected Columns After Outlier Removal",
       x = "Variables",
       y = "Values")
#{r, include=T, results='markup',message=TRUE} #write.csv(dataset, "cleaned_dataset.csv", row.names = FALSE) #
# Rural/Urban subset: keep only rows with a known land-use classification.
# FIX: the original relabelled by position with factor(..., labels =
# c("Rural", "Urban")), which errors or silently mislabels if the number of
# remaining levels ever differs from two. droplevels() achieves the same
# result ("Rural"/"Urban" are the only levels left) without that fragility.
road_data <- dataset %>%
  select(rur_urbname) %>%
  drop_na() %>%
  filter(!rur_urbname %in% c("Unknown", "Not Reported",
                             "Trafficway Not in State Inventory")) %>%
  mutate(rur_urbname = droplevels(rur_urbname))
Research Question 1: How do temporal factors affect fatal motor vehicle accidents?
SMART question 1: How do day of week and hour of day affect the occurrence and severity of fatal accidents?
SMART question 2: Does day of week affect the day to day variation on total fatalities per day?
Subset for temporal question
# Subset the temporal factors (month, day, day of week, hour) and the number
# of fatalities per accident (fatals)
df_temporal <- dataset %>%
  select("monthname",
         "day",
         "day_weekname",
         "hour",
         "fatals")
# Derive the calendar date, a weekday/weekend flag, a severity category, and
# a four-level time-of-day bucket.
# FIX: in the original case_when, the unknown-hour code (99) satisfied
# `hour >= 18` and was silently classified as "Evening"; it is now mapped to
# NA so unknown hours never contaminate the Evening bucket.
df_temporal <- df_temporal %>%
  mutate(
    date = as.Date(paste("2022", monthname, day, sep = "-"), format = "%Y-%b-%d"),
    day_type = ifelse(day_weekname %in% c("Saturday", "Sunday"), "Weekend", "Weekday"),
    fatal_category = ifelse(fatals > 1, "multiple", "one"),
    time_of_day = case_when(
      hour > 23 ~ NA_character_,  # 99 = unknown hour in FARS coding
      hour < 6 ~ "Early Morning",
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      TRUE ~ "Evening"))
# Convert into factor variables with explicit, readable level orderings
df_temporal$day_weekname <- factor(df_temporal$day_weekname, levels = c("Sunday","Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
df_temporal$time_of_day <- factor(df_temporal$time_of_day, levels = c("Early Morning", "Morning", "Afternoon", "Evening"))
Check whether every date in 2022 appears in the data frame; all dates in 2022 are included.
Frequency Analysis: Day of Week and Weekday vs Weekend This section calculates the frequency of occurrences for each day of the week and categorizes the results into weekdays and weekends.
Day of Week Analysis: Sunday to Friday each has frequency of 52 occurrences, while Saturday has 53. The distribution across the days is balanced.
Weekday vs Weekend Analysis: Weekdays have a total of 260 counts, and weekends have 105 counts.
# Drop records where the crash hour is unknown (coded 99 in FARS)
df_hour <- filter(df_temporal, hour != 99)
# Number of fatal accidents recorded in each hour of the day
df_hour_accidents <- df_hour %>%
  count(hour, name = "total_accidents")
df_hour_accidents
# Top 5 hours with the highest number of fatal accidents
#head(df_hour_accidents[order(df_hour_accidents$total_accidents, decreasing = TRUE), ], 5)
# Top 5 hours with the lowest number of fatal accidents
#head(df_hour_accidents[order(df_hour_accidents$total_accidents), ], 5)
The peak hours for the highest number of accidents occur between 6 PM and 10 PM.

#### Day of Week
# Number of fatal accidents on each day of the week
df_wday_accidents <- df_temporal %>%
  count(day_weekname, name = "total_accidents")
df_wday_accidents
Saturday and Sunday appear to have higher count of fatal accidents than rest of days of week.
# Hourly counts of fatal accidents, coloured by time-of-day bucket
# (one manual colour per factor level, in level order)
df_hour %>%
  ggplot(aes(x = hour, fill = time_of_day)) +
  geom_bar() +
  scale_fill_manual(values = c("#999999", "#E69F00", "#D55E00", "#0072B2")) +
  labs(x = "hour", y = "Number of Fatal Accidents",
       title = "Count of Fatal Accidents by Hour",
       fill = "time of day") +
  theme_minimal()
Given the patterns in hour of day, we will categorize the day into four time groups: early morning, morning, afternoon, and evening.
# Flag each day of the week as Weekday/Weekend for downstream summaries
df_wday_accidents <- df_wday_accidents %>%
  mutate(day_type = ifelse(day_weekname %in% c("Saturday", "Sunday"),
                           "Weekend", "Weekday"))
# Data Visualization on Occurrence of fatal accidents per day of week.
# FIX: scale_fill_manual applies values/labels in factor-level order, and the
# character day_type is factored alphabetically ("Weekday" < "Weekend"); the
# original labels = c("weekend", "weekday") therefore swapped the legend
# labels (grey Mon-Fri bars were captioned "weekend").
df_temporal %>% ggplot(aes(x = day_weekname, fill = day_type)) +
  geom_bar() +
  labs(x = "Day of the Week", y = "Number of Fatal Accidents",
       title = "Frequency of Fatal Accidents by Day of the Week",
       fill = "Day of Week") +
  scale_fill_manual(values = c("#bfbfbf", "#0072B2"),
                    labels = c("Weekday", "Weekend")) +
  theme_minimal()
We use a Chi-squared test to determine whether accidents are uniformly distributed across times of the day and days of the week.

#### Hour of Day

Based on the observed patterns of accident occurrence across hours of the day, and following common practice, we divide the hours of the day into four groups.
# Chi-squared goodness-of-fit test: are fatal accidents uniformly distributed
# across the four time-of-day buckets? (default null: equal probabilities)
table(df_hour$time_of_day)
chisq_result <- chisq.test(table(df_hour$time_of_day))
chisq_result
# Get observed and expected counts from the fitted test object
observed <- chisq_result$observed
expected <- chisq_result$expected
# Standardized (Pearson) residuals show which buckets deviate most from
# uniformity; |residual| > 2 is the conventional flag
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
In the data set, six days of the week have 52 occurrences and one has 53. Although the days are not perfectly equal, the impact will likely be minimal given the small difference between 52 and 53. For simplicity, and as a common assumption, I will use an equal distribution across all seven days in the test.
# Chi-squared goodness-of-fit test: is the accident count uniform across the
# seven days of the week?
wday_counts <- table(df_temporal$day_weekname)
wday_counts
chi_sq_test <- chisq.test(wday_counts)
print(chi_sq_test)
# Observed and expected counts from the test object
observed <- chi_sq_test$observed
expected <- chi_sq_test$expected
# Standardized residuals identify which days deviate most from uniformity
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
P-value for both test are less than 0.05, indicating that there is a statistically significant deviation from a uniform distribution, suggesting that certain hours and days have higher frequencies of accidents compared to others.
Since the majority of accidents have one fatality, and accidents with more than four fatalities are rare, we will group the number of fatalities into “one” and “multiple” fatalities.
# Time-of-day x severity (one vs multiple fatalities) contingency table
hr_contingency <- table(df_hour$time_of_day, df_hour$fatal_category)
hr_contingency
# Severity counts per (day of week, fatality category)
fatality_counts <- df_temporal %>%
  count(day_weekname, fatal_category, name = "count")
df_temporal
# Day-of-week x severity contingency table
wday_contingency <- table(df_temporal$day_weekname, df_temporal$fatal_category)
wday_contingency
# Flatten the hour contingency table into a tidy data frame with
# descriptive column names
df_hr_contingency <- setNames(as.data.frame(hr_contingency),
                              c("time_of_day", "fatal_category", "Freq"))
df_hr_contingency
# Within each time-of-day bucket, the proportion of one- vs
# multiple-fatality accidents
grouped_hr_contingency <- df_hr_contingency %>%
  group_by(time_of_day) %>%
  mutate(proportion = Freq / sum(Freq))
grouped_hr_contingency
# Faceted bars; free y-scales because the two categories differ in magnitude
ggplot(grouped_hr_contingency,
       aes(x = time_of_day, y = proportion, fill = fatal_category)) +
  geom_bar(stat = "identity", alpha = 0.75) +
  facet_wrap(~ fatal_category, scales = "free_y")
# Flatten the day-of-week contingency table into a tidy data frame with
# descriptive column names
df_wday_contingency <- setNames(as.data.frame(wday_contingency),
                                c("day_weekname", "fatal_category", "Freq"))
# Frequency-count version of the severity plot (kept for reference):
#df_wday_contingency %>% ggplot(aes(x= day_weekname, y = Freq, fill = fatal_category))+
# geom_bar(stat="identity", alpha = 0.75) +
# facet_wrap(~ fatal_category)+
# labs(x = "Day of the Week", y = "Frequency") +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Within each day of week, the proportion of one- vs multiple-fatality
# accidents
grouped_wday_contingency <- df_wday_contingency %>%
  group_by(day_weekname) %>%
  mutate(proportion = Freq / sum(Freq))
grouped_wday_contingency
ggplot(grouped_wday_contingency,
       aes(x = day_weekname, y = proportion, fill = fatal_category)) +
  geom_bar(stat = "identity", alpha = 0.75) +
  facet_wrap(~ fatal_category, scales = "free_y") +
  labs(x = "Day of the Week", y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Chi-squared test: hour of day
# Test of independence between fatality category and time of day
chi_sq_result <- chisq.test(hr_contingency)
print(chi_sq_result)
# standardized residuals table
# get the expected and observed values
observed <- chi_sq_result$observed
expected <- chi_sq_result$expected
# calculate the standardized residuals
# Cells with large |residual| drive the association
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
Because the p-value is less than 0.05, we reject the null hypothesis at the 5% level of significance. This indicates a significant relationship between the number of fatalities per accident (‘one’ vs. ‘multiple’) and the time of day.
# Chi-squared test of independence: day of week vs. fatality category
chi_sq_test<- chisq.test(wday_contingency)
print(chi_sq_test)
# Get observed and expected counts
observed <- chi_sq_test$observed
expected <- chi_sq_test$expected
# Calculate standardized residuals
# Positive residuals = more accidents in that cell than independence predicts
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
The result reveals a clear pattern in the data, highlighting that weekends are associated with more severe accidents involving multiple fatalities.
We look further into total fatalities per day to provide a more nuanced understanding of how temporal factors affect fatal accidents.
# Data frame for daily fatalities: one row per calendar date with the
# number of accidents, the mean fatalities per accident, and the total
# fatalities for that day.
# FIX: the original computed this identical pipeline twice back-to-back;
# the duplicate has been removed. `.groups = "drop"` replaces the
# trailing ungroup() and silences the summarise() grouping message.
df_daily <- df_temporal %>%
  group_by(date, day_weekname) %>%
  summarise(
    total_accidents = n(),
    average_fatals = mean(fatals),
    total_fatals = sum(fatals),
    .groups = "drop"
  )
# Convert day_weekname to an ordered factor so plots run Sun..Sat
df_daily$day_weekname <- factor(
  df_daily$day_weekname,
  levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
             "Thursday", "Friday", "Saturday")
)
# Top 5 Highest Daily Fatalities
head(df_daily[order(df_daily$total_fatals, decreasing = TRUE), ], 5)
# Top 5 Lowest Daily Fatalities
tail(df_daily[order(df_daily$total_fatals, decreasing = TRUE), ], 5)
# names of days of week
df_daily
# Label each date as Weekend (Sat/Sun) or Weekday
df_daily <- df_daily %>%
mutate(day_type = ifelse(day_weekname %in% c("Saturday", "Sunday"), "Weekend", "Weekday"))
# Boxplot of daily fatalities by day of week, filled by weekend/weekday
df_daily %>% ggplot(aes(x = day_weekname, y = total_fatals, fill = day_type))+
geom_boxplot() +
labs(x = "Day of the Week",
y = "Daily Fatalities",
title = "Daily Fatalities by Day of the Week")+
scale_fill_manual(values = c("#bfbfbf","#0072B2"))
The boxplot shows that daily fatalities are generally higher on weekends. This suggests that weekends tend to have a higher count of fatalities, potentially due to increased travel.
# Per-day-of-week summary: sample size, mean and SD of daily fatalities
df_summary_day_week <- df_daily %>%
group_by(day_weekname) %>%
summarise(
sample_size = n(),
avg_fatalities = mean(total_fatals),
std_fatalities = sd(total_fatals)) %>%
ungroup()
df_summary_day_week
The average and standard deviation of daily fatalities are notably higher on weekends, indicating both a higher mean and greater variability in fatalities compared to weekdays. The sample size is balanced across groups.
# QQ plot
# One Q-Q plot per day of week to visually assess normality of daily totals
for(day in unique(df_daily$day_weekname)) {
qqnorm(df_daily$total_fatals[df_daily$day_weekname == day],
main = paste("Q-Q Plot for ", day))
qqline(df_daily$total_fatals[df_daily$day_weekname == day], col = "blue")
}
# Shapiro: Normality test
# One Shapiro-Wilk p-value per day-of-week group
df_daily %>%
group_by(day_weekname) %>%
summarise(normality_p_value = shapiro.test(total_fatals)$p.value)
# Levene test: Homogeneity of Variance (from the car package)
leveneTest(total_fatals ~ day_weekname, data = df_daily)
Since the number of fatalities per day is relatively large and spans a wide range of values, the daily fatality counts approximate a continuous variable. Combined with sufficiently large sample sizes for each day of the week, the Central Limit Theorem applies, making the sampling distribution of the mean fatalities per day approximately normal. Additionally, normality tests on daily fatalities grouped by day of the week do not reject the assumption of normality. Therefore, we can treat fatalities per day as approximately continuous and appropriately use ANOVA to test for differences across the days of the week
After performing Levene’s test for homogeneity, we conclude that the variances across the groups are significantly different at the 5% level of significance. This means the homogeneity-of-variance assumption of ANOVA is violated, so we use Welch’s ANOVA in the following section.
# Welch's ANOVA
# var.equal = FALSE because Levene's test rejected equal variances
Wanova_result <- oneway.test(total_fatals ~ day_weekname, data = df_daily, var.equal = FALSE)
Wanova_result
# post hoc test-Games Howell
# Pairwise comparisons robust to unequal variances (PMCMRplus package)
posthoc_result <- gamesHowellTest(total_fatals ~ day_weekname, data = df_daily)
print(posthoc_result)
With p-values below 0.05, there is a statistically significant difference in the number of daily fatalities across different days of the week. This indicates that the day of the week can affect daily fatality counts.
Let’s summarize the data to calculate the total fatalities by state.
# Total fatalities per (state, county) pair
state_county_summary <- dataset %>%
group_by(statename, countyname) %>%
summarise(Total_Fatalities = sum(fatals)) %>%
ungroup()
## `summarise()` has grouped output by 'statename'. You can override using the
## `.groups` argument.
# Roll county totals up to state level, largest first
state_summary <- state_county_summary %>%
group_by(statename) %>%
summarise(Total_Fatalities = sum(Total_Fatalities)) %>%
arrange(desc(Total_Fatalities))
# Shiny UI: a state selector in the sidebar and two drill-down plot tabs
ui <- fluidPage(
titlePanel("Drill-Down Analysis of Crash Fatalities by State and County"),
sidebarLayout(
sidebarPanel(
selectInput("state_select", "Select a State:", choices = unique(state_summary$statename), selected = "Texas")
),
mainPanel(
tabsetPanel(
tabPanel("State-Level Analysis", plotlyOutput("statePlot")), # State-level plot tab
tabPanel("County-Level Analysis", plotlyOutput("countyPlot")) # County-level plot tab
)
)
)
)
# Define server logic for the Shiny app
server <- function(input, output) {
# State-level Plot for Top 10 States by Total Fatalities
output$statePlot <- renderPlotly({
top_state_summary <- state_summary %>%
slice_head(n = 10) # Select top 10 states by total fatalities
# Horizontal gradient bar chart with exact-value labels on each bar
state_bar_plot <- ggplot(top_state_summary, aes(x = reorder(statename, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
geom_text(aes(label = Total_Fatalities), hjust = -0.2, color = "black", size = 4) +
labs(title = "Top 10 States by Total Fatalities",
x = "State",
y = "Total Number of Fatalities") +
theme_minimal() +
scale_fill_gradient(low = "lightblue", high = "darkred") +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
legend.position = "none"
) +
coord_flip() # Horizontal bar plot
ggplotly(state_bar_plot, tooltip = c("x", "y"))
})
# County-level Plot for Selected State
output$countyPlot <- renderPlotly({
# Filter county-level data for the selected state
county_data <- state_county_summary %>%
filter(statename == input$state_select) %>%
arrange(desc(Total_Fatalities)) %>%
slice_head(n = 10) # Select top 10 counties by fatalities within the selected state
# Same bar-chart layout as the state plot, with a warmer palette
county_bar_plot <- ggplot(county_data, aes(x = reorder(countyname, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
geom_text(aes(label = Total_Fatalities), hjust = -0.2, color = "black", size = 4) +
labs(title = paste("Top 10 Counties by Total Fatalities in", input$state_select),
x = "County",
y = "Total Number of Fatalities") +
theme_minimal() +
scale_fill_gradient(low = "lightyellow", high = "darkorange") +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
legend.position = "none"
) +
coord_flip() # Horizontal bar plot for counties
ggplotly(county_bar_plot, tooltip = c("x", "y"))
})
}
# Run the Shiny app
shinyApp(ui = ui, server = server)
Research Question 2: What is the relationship between weather and lighting conditions on the occurrence of fatal accidents?
# Define UI for the Shiny app
# Two tabs: fatalities by weather condition and by lighting condition
ui <- fluidPage(
titlePanel("Fatalities by Weather and Lighting Conditions"),
sidebarLayout(
sidebarPanel(
helpText("Explore fatalities by weather and lighting conditions"),
hr()
),
mainPanel(
tabsetPanel(
tabPanel("Weather Conditions", plotlyOutput("weatherPlot")),
tabPanel("Lighting Conditions", plotlyOutput("lightingPlot"))
)
)
)
)
# Define server logic for Shiny app
# Reactives aggregate fatalities; each plot uses a log y-axis but labels
# bars with the original (untransformed) totals.
server <- function(input, output) {
# Filter and aggregate data for weather conditions
weather_data <- reactive({
dataset %>%
filter(weathername != "Not Reported" & weathername != "Reported as Unknown") %>%
group_by(weathername) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
})
# Filter and aggregate data for lighting conditions
lighting_data <- reactive({
dataset %>%
filter(lgt_condname != "Not Reported" & lgt_condname != "Reported as Unknown") %>%
group_by(lgt_condname) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
})
# Plot for Weather Conditions with Log Scale and Original Value Labels
output$weatherPlot <- renderPlotly({
weather_plot <- ggplot(weather_data(), aes(x = reorder(weathername, Total_Fatalities), y = Total_Fatalities)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
scale_y_log10() + # Apply logarithmic scale to y-axis
labs(title = "Fatalities by Weather Conditions", x = "Weather Condition", y = "Total Fatalities (Log Scale)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = scales::comma(Total_Fatalities)), # Display original values on bars
hjust = -0.2, size = 3, color = "black") # Adjust positioning for readability
ggplotly(weather_plot, tooltip = c("x", "y"))
})
# Plot for Lighting Conditions with Log Scale and Original Value Labels
output$lightingPlot <- renderPlotly({
lighting_plot <- ggplot(lighting_data(), aes(x = reorder(lgt_condname, Total_Fatalities), y = Total_Fatalities)) +
geom_bar(stat = "identity", fill = "orange") +
coord_flip() +
scale_y_log10() + # Apply logarithmic scale to y-axis
labs(title = "Fatalities by Lighting Conditions", x = "Lighting Condition", y = "Total Fatalities (Log Scale)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = scales::comma(Total_Fatalities)), # Display original values on bars
hjust = -0.2, size = 3, color = "black") # Adjust positioning for readability
ggplotly(lighting_plot, tooltip = c("x", "y"))
})
}
# Run the Shiny app
shinyApp(ui = ui, server = server)
## Filtering out the records that is unknown or not reported
# NOTE: these filters permanently drop rows from `dataset`, affecting
# every chunk that runs after this point.
dataset <- dataset %>%
filter(!(weathername %in% c("Not Reported", "Reported as Unknown")))
dataset <- dataset %>%
filter(!(lgt_condname %in% c("Not Reported", "Reported as Unknown")))
# Aggregating fatal counts by weather and lighting
weather_light <- dataset %>%
group_by(weathername, lgt_condname) %>%
summarize(total_fatals = sum(fatals, na.rm = TRUE))
## `summarise()` has grouped output by 'weathername'. You can override using the
## `.groups` argument.
# Interactive Heatmap
# plotly heatmap: weather on x, lighting on y, color = total fatalities
heatmap_plot <- plot_ly(weather_light, x = ~weathername, y = ~lgt_condname, z = ~total_fatals, type = "heatmap") %>%
layout(title = "Fatality Count by Weather and Lighting Condition",
xaxis = list(title = "Weather Condition"),
yaxis = list(title = "Lighting Condition"))
heatmap_plot
# Fatalities by state under adverse weather AND dark lighting conditions.
# NOTE(review): the category strings here (e.g. "Dark - Lighted" vs
# "Dark- Unknown Lighting" spacing) must match the dataset's values
# exactly — mismatched strings silently drop rows; verify against
# unique(dataset$weathername) / unique(dataset$lgt_condname).
state_analysis <- dataset %>%
filter(weathername %in% c("Rain", "Sleet or Hail", "Blowing Snow","Blowing Sand,Soil Dirt","Severe Crosswinds","Fog,Smog,Smoke","Freezing Rain or Drizzle","Snow"), lgt_condname %in% c("Dark-Not Lighted", "Dark - Lighted","Dark- Unknown Lighting")) %>%
group_by(statename) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
# Select the top 10 states with the highest fatalities
top_adverse_states <- head(state_analysis, 10) # Top 10 states
# Horizontal gradient bar chart of those top 10 states
ggplot(top_adverse_states, aes(x = reorder(statename, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "Top 10 States by Fatal Accidents in Adverse Weather or Poor Lighting",
x = "State", y = "Total Fatalities") +
theme_minimal() +
theme(axis.text.y = element_text(size = 10), # Increase font size for readability
legend.position = "none") # Hide the legend
# Fatalities by state under clear/cloudy weather AND good lighting.
# Overwrites `state_analysis` from the adverse-conditions chunk above.
state_analysis <- dataset %>%
filter(weathername %in% c("Clear","Cloudy"), lgt_condname %in% c("Daylight", "Dawn","Dusk")) %>%
group_by(statename) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
# Select the top 10 states with the highest fatalities
top_states <- head(state_analysis, 10) # Top 10 states
# Horizontal gradient bar chart of those top 10 states
ggplot(top_states, aes(x = reorder(statename, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "States with highest Fatal Accidents in clear weather and good lighting",
x = "State", y = "Total Fatalities") +
theme_minimal() +
theme(axis.text.y = element_text(size = 10), # Increase font size for readability
legend.position = "none") # Hide the legend
# One-way ANOVA: does mean fatalities per crash differ across lighting
# conditions?
anova_lighting <- aov(fatals ~ lgt_condname, data = dataset)
# Display ANOVA summary
summary(anova_lighting)
## Df Sum Sq Mean Sq F value Pr(>F)
## lgt_condname 6 3 0.534 4.52 0.00014 ***
## Residuals 21994 2597 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD post-hoc: pairwise differences between lighting conditions
tukey_results <- TukeyHSD(anova_lighting, "lgt_condname")
print(tukey_results)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = fatals ~ lgt_condname, data = dataset)
##
## $lgt_condname
## diff lwr upr p adj
## Dark - Not Lighted-Dark - Lighted 0.00636 -0.0146 0.027261 0.973
## Dark - Unknown Lighting-Dark - Lighted 0.00457 -0.0529 0.062097 1.000
## Dawn-Dark - Lighted -0.03764 -0.0887 0.013414 0.310
## Daylight-Dark - Lighted -0.01869 -0.0376 0.000196 0.055
## Dusk-Dark - Lighted -0.00662 -0.0540 0.040757 1.000
## Other-Dark - Lighted 0.10072 -0.3527 0.554125 0.995
## Dark - Unknown Lighting-Dark - Not Lighted -0.00178 -0.0585 0.054969 1.000
## Dawn-Dark - Not Lighted -0.04399 -0.0942 0.006186 0.130
## Daylight-Dark - Not Lighted -0.02505 -0.0414 -0.008663 0.000
## Dusk-Dark - Not Lighted -0.01297 -0.0594 0.033460 0.983
## Other-Dark - Not Lighted 0.09436 -0.3590 0.547673 0.996
## Dawn-Dark - Unknown Lighting -0.04221 -0.1156 0.031203 0.619
## Daylight-Dark - Unknown Lighting -0.02327 -0.0793 0.032772 0.885
## Dusk-Dark - Unknown Lighting -0.01119 -0.0821 0.059715 0.999
## Other-Dark - Unknown Lighting 0.09614 -0.3603 0.552611 0.996
## Daylight-Dawn 0.01895 -0.0304 0.068322 0.919
## Dusk-Dawn 0.03102 -0.0347 0.096790 0.807
## Other-Dawn 0.13836 -0.3173 0.594054 0.973
## Dusk-Daylight 0.01207 -0.0335 0.057636 0.987
## Other-Daylight 0.11941 -0.3338 0.572632 0.987
## Other-Dusk 0.10734 -0.3480 0.562637 0.993
# Filter data for adverse and clear weather conditions
adverse_conditions <- c("Rain", "Sleet or Hail", "Blowing Snow","Blowing Sand,Soil Dirt","Severe Crosswinds","Fog,Smog,Smoke","Freezing Rain or Drizzle","Snow")
fair_conditions <- c("Clear","Cloudy")
# Summarize fatalities by state for adverse weather conditions
adverse_weather <- dataset %>%
  filter(weathername %in% adverse_conditions) %>%
  group_by(statename) %>%
  summarise(Adverse_Fatalities = sum(fatals, na.rm = TRUE))
# Summarize fatalities by state for fair weather conditions.
# BUG FIX: the original used `weathername == fair_conditions`, which
# recycles the length-2 vector element-wise against the column and
# silently keeps only rows where the value happens to line up with the
# alternating pattern; `%in%` is the correct set-membership test.
fair_weather <- dataset %>%
  filter(weathername %in% fair_conditions) %>%
  group_by(statename) %>%
  summarise(Clear_Fatalities = sum(fatals, na.rm = TRUE))
# Merge the data for adverse and clear weather fatalities by state
weather_comparison <- merge(adverse_weather, fair_weather, by = "statename", all = TRUE)
# Replace NA values with 0 (in case a state has no fatalities in one of the conditions)
weather_comparison[is.na(weather_comparison)] <- 0
# Perform the independent (Welch) t-test: adverse vs. fair totals per state
t_test_result <- t.test(weather_comparison$Adverse_Fatalities, weather_comparison$Clear_Fatalities,
                        alternative = "two.sided", var.equal = FALSE)
print(t_test_result)
## NOTE(review): the output below was produced by the pre-fix code;
## re-knit the document to refresh these numbers.
##
## Welch Two Sample t-test
##
## data: weather_comparison$Adverse_Fatalities and weather_comparison$Clear_Fatalities
## t = -4, df = 53, p-value = 1e-04
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -257.8 -91.6
## sample estimates:
## mean of x mean of y
## 35.4 210.0
Research Question 3: How do road conditions (urban/rural, road type, and traffic controls) impact the severity of crashes?
# Keep only records with a known, state-inventory road functional class
road_fatalities <- dataset %>%
filter(func_sysname != "Unknown" & func_sysname != "Not Reported" & func_sysname != "Trafficway Not in State Inventory") %>%
select(func_sysname, fatals) %>%
drop_na() %>%
mutate(func_sysname = as.factor(func_sysname))
# Total, spread (SD), and record count of fatalities per road type
fatality_summary <- road_fatalities %>%
group_by(func_sysname) %>%
summarise(
Total_Fatalities = sum(fatals),
SD_Fatalities = sd(fatals), # Standard deviation for reference
Count = n()
)
# Print summary for inspection
print(fatality_summary)
## # A tibble: 7 × 4
## func_sysname Total_Fatalities SD_Fatalities Count
## <fct> <dbl> <dbl> <int>
## 1 Interstate 3223 0.360 2914
## 2 Local 2592 0.263 2446
## 3 Major Collector 4346 0.328 4012
## 4 Minor Arterial 5140 0.345 4703
## 5 Minor Collector 911 0.265 864
## 6 Principal Arterial - Other 6486 0.383 5867
## 7 Principal Arterial - Other Freeways and … 1149 0.350 1049
# Sort the summary data by Total Fatalities for better visual clarity
fatality_summary <- fatality_summary %>%
arrange(desc(Total_Fatalities))
# Enhanced bar plot with additional features
fatality_total_bar_plot <- ggplot(fatality_summary, aes(x = reorder(func_sysname, -Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
geom_text(aes(label = Total_Fatalities), vjust = -0.5, color = "black", size = 3.5) + # Annotations for exact values
labs(title = "Total Fatalities by Road Type",
x = "Road Type",
y = "Total Number of Fatalities") +
theme_minimal() +
scale_fill_gradient(low = "lightblue", high = "darkred") + # Color gradient for emphasis
theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
# Interactive version with fixed dimensions for the rendered document
fatality_total_bar_plotly <- ggplotly(fatality_total_bar_plot, tooltip = c("x", "y", "Total_Fatalities")) %>%
layout(
xaxis = list(title = "Road Type"),
yaxis = list(title = "Total Number of Fatalities"),
width = 900, # Adjust width for readability
height = 500 # Adjust height for a balanced look
)
fatality_total_bar_plotly
Since we’re now dealing with total counts rather than averages, a chi-square test can be more suitable than an ANOVA to determine if there are statistically significant differences in the distribution of fatalities across road types.
# Contingency table: road type vs. whether the crash had any fatalities.
# NOTE(review): FARS records are fatal crashes, so `fatals > 0` is
# presumably TRUE for every row, collapsing this to a one-column table
# (a goodness-of-fit test on road-type counts rather than a test of
# independence) — the printed "given probabilities" heading and df = 9
# below are inconsistent with a 7 x 2 table; verify this analysis.
fatality_table <- table(road_fatalities$func_sysname, road_fatalities$fatals > 0)
chi_square_result <- chisq.test(fatality_table)
print(chi_square_result)
##
## Chi-squared test for given probabilities
##
## data: fatality_table
## X-squared = 18848, df = 9, p-value <2e-16
# Crash counts per (route, functional system) pair.
# NOTE(review): n() counts crash records, not summed fatalities, even
# though the column is named `fatals` — confirm the intended metric.
route_function_summary <- dataset %>%
group_by(routename, func_sysname) %>%
summarise(fatals = n()) %>%
arrange(desc(fatals)) %>%
ungroup()
## `summarise()` has grouped output by 'routename'. You can override using the
## `.groups` argument.
# Filter the top 10 high-frequency routes and functional systems
top_route_function_summary <- route_function_summary %>%
slice_head(n = 10)
# Horizontal bars: crash frequency per route, colored by functional system
crash_freq_plot <- ggplot(top_route_function_summary, aes(x = reorder(routename, fatals), y = fatals, fill = func_sysname)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
labs(title = "Top Routes and Functional Systems by Crash Frequency",
x = "Route Name",
y = "Fatals") +
theme_minimal() +
scale_fill_brewer(palette = "Paired") + # Colorful palette for road types
theme(
plot.title = element_text(hjust = 0.5, size = 16),
legend.title = element_text(size = 10),
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
coord_flip() # Flip for better readability
crash_freq_plotly <- ggplotly(crash_freq_plot, tooltip = c("x", "y", "fill"))
crash_freq_plotly
# Heatmap of the same top-10 route/system crash counts
crash_freq_heatmap <- ggplot(top_route_function_summary, aes(x = func_sysname, y = reorder(routename, fatals), fill = fatals)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkred", name = "Crash Count") +
labs(title = "Crash Frequency Heatmap by Route and Functional System",
x = "Functional System",
y = "Route Name") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(size = 10)
)
crash_freq_heatmap_plotly <- ggplotly(crash_freq_heatmap, tooltip = c("x", "y", "fill"))
crash_freq_heatmap_plotly
# Summarize crash severity and frequency by functional system and urban/rural classification
summary_data <- dataset %>%
filter(rur_urbname != "Unknown" & rur_urbname != "Not Reported" & rur_urbname != "Trafficway Not in State Inventory") %>%
group_by(rur_urbname, func_sysname) %>%
summarise(
Avg_Severity = mean(fatals, na.rm = TRUE),
Crash_Frequency = n()
) %>%
ungroup()
## `summarise()` has grouped output by 'rur_urbname'. You can override using the
## `.groups` argument.
# Shiny UI: pick a functional system; tabs for severity, frequency, tests
ui <- fluidPage(
titlePanel("Comparing Crash Characteristics on Urban vs. Rural Functional Systems"),
sidebarLayout(
sidebarPanel(
selectInput("system_select", "Select Functional System:", choices = unique(summary_data$func_sysname)),
hr(),
helpText("Select a functional system to compare crash severity and frequency in urban vs. rural areas.")
),
mainPanel(
tabsetPanel(
tabPanel("Crash Severity", plotlyOutput("severityPlot")),
tabPanel("Crash Frequency", plotlyOutput("frequencyPlot")),
tabPanel("Statistical Test Results", verbatimTextOutput("testResults"))
)
)
)
)
# Shiny server: filters summary_data to the selected functional system
# and renders severity/frequency bar charts plus t-test output.
server <- function(input, output){
# Filter data based on selected functional system
filtered_data <- reactive({
summary_data %>%
filter(func_sysname == input$system_select)
})
# Plot: Average Crash Severity in Urban vs. Rural Areas
output$severityPlot <- renderPlotly({
severity_plot <- ggplot(filtered_data(), aes(x = rur_urbname, y = Avg_Severity, fill = rur_urbname)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = paste("Average Crash Severity in", input$system_select),
x = "Area Type (Urban/Rural)",
y = "Average Crash Severity") +
scale_fill_manual(values = c("Urban" = "skyblue", "Rural" = "salmon")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(severity_plot, tooltip = c("x", "y"))
})
# Plot: Crash Frequency in Urban vs. Rural Areas
output$frequencyPlot <- renderPlotly({
frequency_plot <- ggplot(filtered_data(), aes(x = rur_urbname, y = Crash_Frequency, fill = rur_urbname)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = paste("Crash Frequency in", input$system_select),
x = "Area Type (Urban/Rural)",
y = "Crash Frequency") +
scale_fill_manual(values = c("Urban" = "lightgreen", "Rural" = "darkorange")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(frequency_plot, tooltip = c("x", "y"))
})
# Perform t-tests to compare severity and frequency between Urban and Rural areas
# NOTE(review): filtered_data() has at most one row per area type
# (already aggregated), so t.test() with one observation per group will
# error — confirm whether these tests should run on row-level data.
output$testResults <- renderPrint({
data <- filtered_data()
# Check if we have enough data for t-tests
if (nrow(data) >= 2) {
severity_ttest <- t.test(Avg_Severity ~ rur_urbname, data = data)
frequency_ttest <- t.test(Crash_Frequency ~ rur_urbname, data = data)
cat("T-Test Results for Crash Severity:\n")
print(severity_ttest)
cat("\n\n")
cat("T-Test Results for Crash Frequency:\n")
print(frequency_ttest)
} else {
cat("Not enough data for t-tests.")
}
})
}
shinyApp(ui = ui, server = server)
By converting monthname into a factor with a defined order, we ensure that the boxplot accurately reflects the seasonal variations in fatalities. This visualization highlights any monthly patterns or trends in fatal accidents, allowing for a clearer understanding of when fatalities are most likely to occur throughout the year.
# Create total_fatalities based on the fatals column
# Flag each record as fatal (1) or not (0).
# NOTE(review): FARS records are fatal crashes, so this is presumably
# always 1 — confirm; it is overwritten by the raw count further below.
dataset$total_fatalities <- ifelse(dataset$fatals > 0, 1, 0)
# Boxplot of Fatalities by Month
ggplot(dataset, aes(x=factor(monthname, levels=month.name), y=fatals)) + # Convert monthname to a factor
geom_boxplot(fill="lightgreen") +
labs(title="Boxplot of Fatalities by Month", x="Month", y="Total Fatalities") +
theme_minimal()
We next create the total_fatalities variable, which carries the fatality count from the dataset. We then perform the Shapiro-Wilk normality test to assess whether the data is normally distributed. A Q-Q plot is also generated to visually inspect the normality of the data distribution, helping us understand the underlying characteristics of the fatalities.
str(dataset)
# Use the raw fatality count as total_fatalities (overwrites the 0/1
# flag created in an earlier chunk).
dataset$total_fatalities <- dataset$fatals
non_missing_count <- sum(!is.na(dataset$total_fatalities))
# BUG FIX: the original used "\\n", which prints a literal backslash-n
# instead of a newline; "\n" is the correct escape in all cat() calls.
cat("Number of non-missing values in total_fatalities:", non_missing_count, "\n")
unique_values <- unique(dataset$total_fatalities)
summary(dataset$total_fatalities)
cat("Unique values in total_fatalities:", unique_values, "\n")
# shapiro.test() only accepts 3-5000 observations, so guard the call;
# with ~39k rows this branch prints the "insufficient" message.
if (non_missing_count >= 3 && non_missing_count <= 5000) {
  shapiro_test <- shapiro.test(dataset$total_fatalities)
  cat("Shapiro-Wilk Normality Test:\n")
  print(shapiro_test)
} else {
  cat("Insufficient sample size for Shapiro-Wilk test.\n")
}
# Visual normality check against the theoretical normal quantiles
qqnorm(dataset$total_fatalities, main="Q-Q Plot for Total Fatalities")
qqline(dataset$total_fatalities, col="red")
### The Q-Q plot displays the quantiles of the sample data (total fatalities) against the quantiles of a theoretical normal distribution. If the data points closely follow the red line, the data is approximately normally distributed.
# Chi-squared goodness-of-fit: are accident records distributed
# uniformly across states?
state_counts <- table(dataset$statename)
chi_square_test <- chisq.test(state_counts)
cat("Chi-Square Test Results:\n")
print(chi_square_test)
# Total fatalities per month, plotted as a line with point markers
monthly_summary <- dataset %>%
group_by(monthname) %>%
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# Order months January..December rather than alphabetically
monthly_summary$monthname <- factor(monthly_summary$monthname, levels = month.name)
ggplot(monthly_summary, aes(x=monthname, y=total_fatalities)) +
geom_line(group=1, color="blue", size=1.2) +
geom_point(color="red", size=3) +
labs(title="Trends in Fatal Accidents by Month (2022)",
x="Month",
y="Total Fatalities") +
theme_minimal()
# Summarizing total fatalities by state and month
state_monthly_summary <- dataset %>%
group_by(statename, monthname) %>% # Ensure correct column names
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# Display the head of the summary
print(head(state_monthly_summary))
# Create a factor for monthname to ensure it is ordered correctly
state_monthly_summary$monthname <- factor(state_monthly_summary$monthname, levels = month.name)
# Plotting state-wise trends in fatalities by month
ggplot(state_monthly_summary, aes(x=monthname, y=total_fatalities, color=statename, group=statename)) +
geom_line(size=1, alpha=0.7) +
geom_point(size=2, alpha=0.8) +
scale_y_continuous(breaks = seq(0, max(state_monthly_summary$total_fatalities, na.rm = TRUE), by = 10)) +
labs(title="State-wise Trends in Fatal Accidents by Month",
x="Month",
y="Total Fatalities",
color="State") +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.minor = element_blank())
# Save the plot as a PNG file
# ggsave() with no plot argument writes the most recently displayed plot
ggsave("enlarged_state_trends.png", width = 18, height = 12, dpi = 300)
# Re-aggregate fatalities per state and month (same grouping as above)
regional_monthly_summary <- dataset %>%
group_by(statename, monthname) %>%
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# For each month, the single state with the highest fatality total
high_risk_states <- regional_monthly_summary %>%
group_by(monthname) %>%
slice(which.max(total_fatalities)) %>%
select(monthname, statename, total_fatalities)
cat("States with the highest number of fatalities for each month:\n")
print(high_risk_states)
#Checking if the no.of fatalities have been decreasing or increasing over the months in different regions
regional_trends <- dataset %>%
group_by(statename, monthname) %>%
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# Map month names to 1-12 so a correlation can be computed
regional_trends <- regional_trends %>%
mutate(month_numeric = match(monthname, month.name))
# The sign of cor(month, fatalities) is used as a crude trend direction.
# NOTE(review): cor() returns NA for a constant series, which would
# leave `direction` NA for that state — confirm this is acceptable.
trend_direction <- regional_trends %>%
group_by(statename) %>%
filter(n() > 1) %>%
summarize(trend = cor(month_numeric, total_fatalities, use = "complete.obs")) %>%
mutate(direction = ifelse(trend > 0, "Increasing", "Decreasing"))
cat("Trend Analysis Summary for Fatalities in 2022:\n")
print(trend_direction)
We next prepare the EMS-related variables (derived from the crash time columns, with ARR_HOUR and ARR_MIN available as the EMS scene-arrival fields) and make sure the FATALS column is numeric for analysis.
# --- EMS data preparation ---
# NOTE(review): "EMS arrival time" is derived here from the crash MINUTE
# column; the dataset also carries ARR_HOUR/ARR_MIN (EMS scene arrival) —
# confirm which source column is intended.
# Fix: in FARS, a minute value of 99 codes "unknown", not a real duration.
# Treating it as 99 minutes inflates every downstream mean/sd/regression,
# so anything outside the valid 0-59 range becomes NA instead.
minute_raw <- as.numeric(dataset$minute)
dataset$ems_arrival_time <- ifelse(minute_raw >= 0 & minute_raw <= 59,
                                   minute_raw, NA)
# The fatality count is required downstream; fail fast if it is missing
if ("fatals" %in% colnames(dataset)) {
  dataset$fatals <- as.numeric(dataset$fatals)
} else {
  stop("FATALS column is missing in the dataset.")
}
# Bucket crashes into severity tiers by fatality count:
# (0,1] -> Low, (1,3] -> Medium, (3,5] -> High, (5,Inf) -> Very High
dataset$severity_level <- cut(
  dataset$fatals,
  breaks = c(0, 1, 3, 5, Inf),
  labels = c("Low (1 Fatality)", "Medium (2-3 Fatalities)", "High (4-5 Fatalities)", "Very High (6+ Fatalities)")
)
# Overview of the EMS-related data after preparation
glimpse(dataset[, c("ems_arrival_time", "fatals", "severity_level")])
summary(dataset[, c("ems_arrival_time", "fatals", "severity_level")])
# Descriptive statistics for EMS arrival time and fatality counts
# (NAs are excluded from every aggregate)
ems_stats <- dataset %>%
  summarize(
    mean_arrival_time   = mean(ems_arrival_time, na.rm = TRUE),
    median_arrival_time = median(ems_arrival_time, na.rm = TRUE),
    sd_arrival_time     = sd(ems_arrival_time, na.rm = TRUE),
    min_arrival_time    = min(ems_arrival_time, na.rm = TRUE),
    max_arrival_time    = max(ems_arrival_time, na.rm = TRUE),
    mean_fatalities     = mean(fatals, na.rm = TRUE),
    max_fatalities      = max(fatals, na.rm = TRUE)
  )
print(ems_stats)
# Distribution of EMS arrival times within each severity tier
ggplot(dataset,
       aes(x = severity_level, y = ems_arrival_time, fill = severity_level)) +
  geom_boxplot(alpha = 0.6) +
  theme_minimal() +
  labs(title = "EMS Arrival Time by Fatality Severity Level",
       x = "Severity Level",
       y = "EMS Arrival Time (minutes)",
       fill = "Severity Level")
# Association between EMS arrival time and crash severity (fatality count).
# Fix: cor.test() has no `use` argument — a `use = "complete.obs"` passed in
# is silently absorbed by `...` and ignored. cor.test() already performs
# complete-case deletion on the pair internally, so no argument is needed.
correlation_severity <- cor.test(dataset$ems_arrival_time, dataset$fatals)
cat("Correlation between EMS Arrival Time and Severity (Number of Fatalities):\n")
print(correlation_severity)
# Simple linear model: fatalities as a function of EMS arrival time.
# Rows with NA are dropped by lm()'s default na.action (na.omit).
linear_model <- lm(fatals ~ ems_arrival_time, data = dataset)
summary(linear_model)
# Scatter of fatalities against EMS arrival time with a fitted OLS line
ggplot(dataset, aes(x = ems_arrival_time, y = fatals)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  theme_minimal() +
  labs(title = "Linear Regression: EMS Arrival Time vs Number of Fatalities",
       x = "EMS Arrival Time (minutes)",
       y = "Number of Fatalities")
We will use an ANOVA test to see if there’s a statistically significant difference in EMS arrival times across different severity levels.
# One-way ANOVA: does mean EMS arrival time differ across severity levels?
anova_result <- aov(ems_arrival_time ~ severity_level, data = dataset)
summary(anova_result)
# Tukey post-hoc: pairwise comparisons between severity levels
TukeyHSD(anova_result)
# Binary outcome: 1 = high-severity crash (more than 3 fatalities), 0 = otherwise
dataset$high_severity <- ifelse(dataset$fatals > 3, 1, 0)
# Logistic regression of high severity on EMS arrival time
logistic_model <- glm(high_severity ~ ems_arrival_time, data = dataset, family = binomial)
summary(logistic_model)
# Exponentiated coefficients are odds ratios
exp(coef(logistic_model))
# Narrative summary driven by the correlation test computed above
cat("Summary of Findings:\n")
significant <- correlation_severity$p.value < 0.05
if (significant) {
  cat("- The analysis indicates a significant relationship between EMS arrival time and the severity of accidents.\n")
  # Direction of the association comes from the sign of the estimate
  if (correlation_severity$estimate > 0) {
    cat("- Longer EMS response times are associated with more severe fatal outcomes.\n")
  } else {
    cat("- Shorter EMS response times are associated with less severe fatal outcomes.\n")
  }
} else {
  cat("- There is no significant relationship between EMS arrival time and the severity of accidents.\n")
}
An insightful journey into understanding the patterns behind road fatalities in the U.S. and creating actionable insights for a safer future.
Motor vehicle accidents are a leading cause of unintentional injury-related deaths in the U.S. Using the 2022 FARS dataset, our analysis focuses on revealing trends and risk factors that contribute to fatal crashes.
Our first step is getting ready by loading the necessary packages and the data.
# Loading the necessary libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(PMCMRplus)
library(here)
## here() starts at C:/Users/adity/OneDrive/Desktop/New folder
# Loading the dataset
dataset <- read_csv("dataset.csv")
## Rows: 39480 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (34): STATENAME, COUNTYNAME, CITYNAME, MONTHNAME, DAY_WEEKNAME, HOURNAME...
## dbl (49): OBJECTID, STATE, ST_CASE, PEDS, PERNOTMVIT, VE_TOTAL, VE_FORMS, PV...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Now viewing the structure of the dataset
str(dataset)
## spc_tbl_ [39,480 × 83] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ OBJECTID : num [1:39480] 1 2 3 4 5 6 7 8 9 10 ...
## $ STATE : num [1:39480] 1 1 1 1 1 1 1 1 1 1 ...
## $ STATENAME : chr [1:39480] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ ST_CASE : num [1:39480] 10001 10002 10003 10004 10005 ...
## $ PEDS : num [1:39480] 0 0 0 0 1 1 0 0 0 0 ...
## $ PERNOTMVIT : num [1:39480] 0 0 0 0 1 1 0 0 0 0 ...
## $ VE_TOTAL : num [1:39480] 2 2 1 1 1 1 2 1 2 1 ...
## $ VE_FORMS : num [1:39480] 2 2 1 1 1 1 2 1 2 1 ...
## $ PVH_INVL : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ PERSONS : num [1:39480] 3 5 2 1 1 5 1 1 3 1 ...
## $ PERMVIT : num [1:39480] 3 5 2 1 1 5 1 1 3 1 ...
## $ COUNTY : num [1:39480] 107 101 115 101 73 101 63 101 71 131 ...
## $ COUNTYNAME : chr [1:39480] "PICKENS (107)" "MONTGOMERY (101)" "ST. CLAIR (115)" "MONTGOMERY (101)" ...
## $ CITY : num [1:39480] 0 0 0 0 0 2130 0 0 0 0 ...
## $ CITYNAME : chr [1:39480] "NOT APPLICABLE" "NOT APPLICABLE" "NOT APPLICABLE" "NOT APPLICABLE" ...
## $ MONTH : num [1:39480] 1 1 1 1 1 1 1 1 1 1 ...
## $ MONTHNAME : chr [1:39480] "January" "January" "January" "January" ...
## $ DAY : num [1:39480] 1 1 1 2 2 2 4 4 4 5 ...
## $ DAYNAME : num [1:39480] 1 1 1 2 2 2 4 4 4 5 ...
## $ DAY_WEEK : num [1:39480] 7 7 7 1 1 1 3 3 3 4 ...
## $ DAY_WEEKNAME: chr [1:39480] "Saturday" "Saturday" "Saturday" "Sunday" ...
## $ YEAR : num [1:39480] 2022 2022 2022 2022 2022 ...
## $ HOUR : num [1:39480] 12 16 1 14 18 18 9 14 11 0 ...
## $ HOURNAME : chr [1:39480] "12:00pm-12:59pm" "4:00pm-4:59pm" "1:00am-1:59am" "2:00pm-2:59pm" ...
## $ MINUTE : num [1:39480] 30 40 33 46 48 28 5 50 40 0 ...
## $ MINUTENAME : chr [1:39480] "30" "40" "33" "46" ...
## $ TWAY_ID : chr [1:39480] "US-82 SR-6" "US-231 SR-53" "CR-KELLY CREEK RD" "I-65" ...
## $ TWAY_ID2 : chr [1:39480] NA NA NA NA ...
## $ ROUTE : num [1:39480] 2 2 4 1 1 6 4 4 2 3 ...
## $ ROUTENAME : chr [1:39480] "US Highway" "US Highway" "County Road" "Interstate" ...
## $ RUR_URB : num [1:39480] 1 1 1 1 2 2 1 1 1 1 ...
## $ RUR_URBNAME : chr [1:39480] "Rural" "Rural" "Rural" "Rural" ...
## $ FUNC_SYS : num [1:39480] 3 3 5 1 1 4 5 5 3 3 ...
## $ FUNC_SYSNAME: chr [1:39480] "Principal Arterial - Other" "Principal Arterial - Other" "Major Collector" "Interstate" ...
## $ RD_OWNER : num [1:39480] 1 1 2 1 1 4 2 2 1 1 ...
## $ RD_OWNERNAME: chr [1:39480] "State Highway Agency" "State Highway Agency" "County Highway Agency" "State Highway Agency" ...
## $ NHS : num [1:39480] 1 1 0 1 1 0 0 0 1 1 ...
## $ NHSNAME : chr [1:39480] "This section IS ON the NHS" "This section IS ON the NHS" "This section IS NOT on the NHS" "This section IS ON the NHS" ...
## $ SP_JUR : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ SP_JURNAME : chr [1:39480] "No Special Jurisdiction" "No Special Jurisdiction" "No Special Jurisdiction" "No Special Jurisdiction" ...
## $ MILEPT : num [1:39480] 4 974 0 1595 1342 ...
## $ MILEPTNAME : chr [1:39480] "4" "974" "None" "1595" ...
## $ LATITUDE : num [1:39480] 33.5 32.1 33.4 32.2 33.5 ...
## $ LATITUDENAME: num [1:39480] 33.5 32.1 33.4 32.2 33.5 ...
## $ LONGITUD : num [1:39480] -88.3 -86.1 -86.4 -86.4 -86.7 ...
## $ LONGITUDNAME: num [1:39480] -88.3 -86.1 -86.4 -86.4 -86.7 ...
## $ HARM_EV : num [1:39480] 12 12 42 34 8 8 12 38 12 42 ...
## $ HARM_EVNAME : chr [1:39480] "Motor Vehicle In-Transport" "Motor Vehicle In-Transport" "Tree (Standing Only)" "Ditch" ...
## $ MAN_COLL : num [1:39480] 7 2 0 0 0 0 1 0 6 0 ...
## $ MAN_COLLNAME: chr [1:39480] "Sideswipe - Same Direction" "Front-to-Front" "The First Harmful Event was Not a Collision with a Motor Vehicle in Transport" "The First Harmful Event was Not a Collision with a Motor Vehicle in Transport" ...
## $ RELJCT1 : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ RELJCT1NAME : chr [1:39480] "No" "No" "No" "No" ...
## $ RELJCT2 : num [1:39480] 1 1 1 1 1 1 1 1 2 1 ...
## $ RELJCT2NAME : chr [1:39480] "Non-Junction" "Non-Junction" "Non-Junction" "Non-Junction" ...
## $ TYP_INT : num [1:39480] 1 1 1 1 1 1 1 1 2 1 ...
## $ TYP_INTNAME : chr [1:39480] "Not an Intersection" "Not an Intersection" "Not an Intersection" "Not an Intersection" ...
## $ REL_ROAD : num [1:39480] 1 1 4 4 2 1 1 4 1 4 ...
## $ REL_ROADNAME: chr [1:39480] "On Roadway" "On Roadway" "On Roadside" "On Roadside" ...
## $ WRK_ZONE : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ WRK_ZONENAME: chr [1:39480] "None" "None" "None" "None" ...
## $ LGT_COND : num [1:39480] 1 1 2 1 2 3 1 1 1 2 ...
## $ LGT_CONDNAME: chr [1:39480] "Daylight" "Daylight" "Dark - Not Lighted" "Daylight" ...
## $ WEATHER : num [1:39480] 1 1 10 10 2 1 1 1 1 1 ...
## $ WEATHERNAME : chr [1:39480] "Clear" "Clear" "Cloudy" "Cloudy" ...
## $ SCH_BUS : num [1:39480] 0 0 0 0 0 0 0 0 0 0 ...
## $ SCH_BUSNAME : chr [1:39480] "No" "No" "No" "No" ...
## $ RAIL : chr [1:39480] "0000000" "0000000" "0000000" "0000000" ...
## $ RAILNAME : chr [1:39480] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ NOT_HOUR : num [1:39480] 12 99 1 14 18 18 99 99 11 0 ...
## $ NOT_HOURNAME: chr [1:39480] "12:00pm-12:59pm" "Unknown" "1:00am-1:59am" "2:00pm-2:59pm" ...
## $ NOT_MIN : num [1:39480] 47 99 33 48 48 26 99 99 36 0 ...
## $ NOT_MINNAME : chr [1:39480] "47" "Unknown" "33" "48" ...
## $ ARR_HOUR : num [1:39480] 13 99 1 15 18 18 99 99 11 0 ...
## $ ARR_HOURNAME: chr [1:39480] "1:00pm-1:59pm" "Unknown EMS Scene Arrival Hour" "1:00am-1:59am" "3:00pm-3:59pm" ...
## $ ARR_MIN : num [1:39480] 4 99 50 9 54 32 99 99 54 33 ...
## $ ARR_MINNAME : chr [1:39480] "4" "Unknown EMS Scene Arrival Minutes" "50" "9" ...
## $ HOSP_HR : num [1:39480] 13 99 99 15 88 99 88 99 12 88 ...
## $ HOSP_HRNAME : chr [1:39480] "1:00pm-1:59pm" "Unknown" "Unknown" "3:00pm-3:59pm" ...
## $ HOSP_MN : num [1:39480] 47 99 99 44 88 99 88 99 41 88 ...
## $ HOSP_MNNAME : chr [1:39480] "47" "Unknown EMS Hospital Arrival Time" "Unknown EMS Hospital Arrival Time" "44" ...
## $ FATALS : num [1:39480] 1 2 1 1 1 1 1 1 1 1 ...
## $ x : num [1:39480] -88.3 -86.1 -86.4 -86.4 -86.7 ...
## $ y : num [1:39480] 33.5 32.1 33.4 32.2 33.5 ...
## - attr(*, "spec")=
## .. cols(
## .. OBJECTID = col_double(),
## .. STATE = col_double(),
## .. STATENAME = col_character(),
## .. ST_CASE = col_double(),
## .. PEDS = col_double(),
## .. PERNOTMVIT = col_double(),
## .. VE_TOTAL = col_double(),
## .. VE_FORMS = col_double(),
## .. PVH_INVL = col_double(),
## .. PERSONS = col_double(),
## .. PERMVIT = col_double(),
## .. COUNTY = col_double(),
## .. COUNTYNAME = col_character(),
## .. CITY = col_double(),
## .. CITYNAME = col_character(),
## .. MONTH = col_double(),
## .. MONTHNAME = col_character(),
## .. DAY = col_double(),
## .. DAYNAME = col_double(),
## .. DAY_WEEK = col_double(),
## .. DAY_WEEKNAME = col_character(),
## .. YEAR = col_double(),
## .. HOUR = col_double(),
## .. HOURNAME = col_character(),
## .. MINUTE = col_double(),
## .. MINUTENAME = col_character(),
## .. TWAY_ID = col_character(),
## .. TWAY_ID2 = col_character(),
## .. ROUTE = col_double(),
## .. ROUTENAME = col_character(),
## .. RUR_URB = col_double(),
## .. RUR_URBNAME = col_character(),
## .. FUNC_SYS = col_double(),
## .. FUNC_SYSNAME = col_character(),
## .. RD_OWNER = col_double(),
## .. RD_OWNERNAME = col_character(),
## .. NHS = col_double(),
## .. NHSNAME = col_character(),
## .. SP_JUR = col_double(),
## .. SP_JURNAME = col_character(),
## .. MILEPT = col_double(),
## .. MILEPTNAME = col_character(),
## .. LATITUDE = col_double(),
## .. LATITUDENAME = col_double(),
## .. LONGITUD = col_double(),
## .. LONGITUDNAME = col_double(),
## .. HARM_EV = col_double(),
## .. HARM_EVNAME = col_character(),
## .. MAN_COLL = col_double(),
## .. MAN_COLLNAME = col_character(),
## .. RELJCT1 = col_double(),
## .. RELJCT1NAME = col_character(),
## .. RELJCT2 = col_double(),
## .. RELJCT2NAME = col_character(),
## .. TYP_INT = col_double(),
## .. TYP_INTNAME = col_character(),
## .. REL_ROAD = col_double(),
## .. REL_ROADNAME = col_character(),
## .. WRK_ZONE = col_double(),
## .. WRK_ZONENAME = col_character(),
## .. LGT_COND = col_double(),
## .. LGT_CONDNAME = col_character(),
## .. WEATHER = col_double(),
## .. WEATHERNAME = col_character(),
## .. SCH_BUS = col_double(),
## .. SCH_BUSNAME = col_character(),
## .. RAIL = col_character(),
## .. RAILNAME = col_character(),
## .. NOT_HOUR = col_double(),
## .. NOT_HOURNAME = col_character(),
## .. NOT_MIN = col_double(),
## .. NOT_MINNAME = col_character(),
## .. ARR_HOUR = col_double(),
## .. ARR_HOURNAME = col_character(),
## .. ARR_MIN = col_double(),
## .. ARR_MINNAME = col_character(),
## .. HOSP_HR = col_double(),
## .. HOSP_HRNAME = col_character(),
## .. HOSP_MN = col_double(),
## .. HOSP_MNNAME = col_character(),
## .. FATALS = col_double(),
## .. x = col_double(),
## .. y = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(dataset)
## OBJECTID STATE STATENAME ST_CASE
## Min. : 1 Min. : 1.0 Length:39480 Min. : 10001
## 1st Qu.: 9871 1st Qu.:12.0 Class :character 1st Qu.:121923
## Median :19740 Median :26.0 Mode :character Median :261050
## Mean :19740 Mean :27.2 Mean :272763
## 3rd Qu.:29610 3rd Qu.:42.0 3rd Qu.:420773
## Max. :39480 Max. :56.0 Max. :560118
##
## PEDS PERNOTMVIT VE_TOTAL VE_FORMS PVH_INVL
## Min. : 0.0 Min. : 0.0 Min. : 1.0 Min. : 1.0 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 1.0 1st Qu.: 0.00
## Median : 0.0 Median : 0.0 Median : 1.0 Median : 1.0 Median : 0.00
## Mean : 0.2 Mean : 0.3 Mean : 1.6 Mean : 1.5 Mean : 0.04
## 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 2.0 3rd Qu.: 2.0 3rd Qu.: 0.00
## Max. :73.0 Max. :73.0 Max. :50.0 Max. :50.0 Max. :10.00
##
## PERSONS PERMVIT COUNTY COUNTYNAME
## Min. : 0.0 Min. : 0.0 Min. : 0 Length:39480
## 1st Qu.: 1.0 1st Qu.: 1.0 1st Qu.: 31 Class :character
## Median : 2.0 Median : 2.0 Median : 69 Mode :character
## Mean : 2.2 Mean : 2.2 Mean : 92
## 3rd Qu.: 3.0 3rd Qu.: 3.0 3rd Qu.:115
## Max. :128.0 Max. :128.0 Max. :999
##
## CITY CITYNAME MONTH MONTHNAME
## Min. : 0 Length:39480 Min. : 1.0 Length:39480
## 1st Qu.: 0 Class :character 1st Qu.: 4.0 Class :character
## Median : 190 Mode :character Median : 7.0 Mode :character
## Mean :1308 Mean : 6.7
## 3rd Qu.:2010 3rd Qu.:10.0
## Max. :9999 Max. :12.0
##
## DAY DAYNAME DAY_WEEK DAY_WEEKNAME YEAR
## Min. : 1.0 Min. : 1.0 Min. :1.00 Length:39480 Min. :2022
## 1st Qu.: 8.0 1st Qu.: 8.0 1st Qu.:2.00 Class :character 1st Qu.:2022
## Median :16.0 Median :16.0 Median :4.00 Mode :character Median :2022
## Mean :15.7 Mean :15.7 Mean :4.13 Mean :2022
## 3rd Qu.:23.0 3rd Qu.:23.0 3rd Qu.:6.00 3rd Qu.:2022
## Max. :31.0 Max. :31.0 Max. :7.00 Max. :2022
##
## HOUR HOURNAME MINUTE MINUTENAME
## Min. : 0.0 Length:39480 Min. : 0.0 Length:39480
## 1st Qu.: 7.0 Class :character 1st Qu.:14.0 Class :character
## Median :14.0 Mode :character Median :30.0 Mode :character
## Mean :13.5 Mean :29.3
## 3rd Qu.:19.0 3rd Qu.:45.0
## Max. :99.0 Max. :99.0
##
## TWAY_ID TWAY_ID2 ROUTE ROUTENAME
## Length:39480 Length:39480 Min. :1.00 Length:39480
## Class :character Class :character 1st Qu.:2.00 Class :character
## Mode :character Mode :character Median :3.00 Mode :character
## Mean :3.87
## 3rd Qu.:6.00
## Max. :9.00
##
## RUR_URB RUR_URBNAME FUNC_SYS FUNC_SYSNAME
## Min. :1.00 Length:39480 Min. : 1.0 Length:39480
## 1st Qu.:1.00 Class :character 1st Qu.: 3.0 Class :character
## Median :2.00 Mode :character Median : 4.0 Mode :character
## Mean :1.63 Mean : 4.6
## 3rd Qu.:2.00 3rd Qu.: 5.0
## Max. :9.00 Max. :99.0
##
## RD_OWNER RD_OWNERNAME NHS NHSNAME
## Min. : 1.0 Length:39480 Min. :0.00 Length:39480
## 1st Qu.: 1.0 Class :character 1st Qu.:0.00 Class :character
## Median : 1.0 Mode :character Median :0.00 Mode :character
## Mean :13.5 Mean :0.48
## 3rd Qu.: 4.0 3rd Qu.:1.00
## Max. :99.0 Max. :9.00
##
## SP_JUR SP_JURNAME MILEPT MILEPTNAME
## Min. :0.00 Length:39480 Min. : 0 Length:39480
## 1st Qu.:0.00 Class :character 1st Qu.: 19 Class :character
## Median :0.00 Mode :character Median : 138 Mode :character
## Mean :0.03 Mean :27695
## 3rd Qu.:0.00 3rd Qu.:99998
## Max. :9.00 Max. :99999
##
## LATITUDE LATITUDENAME LONGITUD LONGITUDNAME HARM_EV
## Min. : 18.0 Min. : 18.0 Min. :-159 Min. :-159 Min. : 1.0
## 1st Qu.: 32.9 1st Qu.: 32.9 1st Qu.: -99 1st Qu.: -99 1st Qu.: 8.0
## Median : 36.1 Median : 36.1 Median : -88 Median : -88 Median :12.0
## Mean : 36.6 Mean : 36.6 Mean : -87 Mean : -87 Mean :17.9
## 3rd Qu.: 40.4 3rd Qu.: 40.4 3rd Qu.: -81 3rd Qu.: -81 3rd Qu.:26.0
## Max. :100.0 Max. :100.0 Max. :1000 Max. :1000 Max. :99.0
##
## HARM_EVNAME MAN_COLL MAN_COLLNAME RELJCT1
## Length:39480 Min. : 0 Length:39480 Min. :0.00
## Class :character 1st Qu.: 0 Class :character 1st Qu.:0.00
## Mode :character Median : 0 Mode :character Median :0.00
## Mean : 2 Mean :0.08
## 3rd Qu.: 2 3rd Qu.:0.00
## Max. :99 Max. :9.00
##
## RELJCT1NAME RELJCT2 RELJCT2NAME TYP_INT
## Length:39480 Min. : 1.0 Length:39480 Min. : 1.0
## Class :character 1st Qu.: 1.0 Class :character 1st Qu.: 1.0
## Mode :character Median : 1.0 Mode :character Median : 1.0
## Mean : 2.5 Mean : 1.8
## 3rd Qu.: 2.0 3rd Qu.: 2.0
## Max. :99.0 Max. :99.0
##
## TYP_INTNAME REL_ROAD REL_ROADNAME WRK_ZONE
## Length:39480 Min. : 1.0 Length:39480 Min. :0.00
## Class :character 1st Qu.: 1.0 Class :character 1st Qu.:0.00
## Mode :character Median : 1.0 Mode :character Median :0.00
## Mean : 2.4 Mean :0.04
## 3rd Qu.: 4.0 3rd Qu.:0.00
## Max. :99.0 Max. :4.00
##
## WRK_ZONENAME LGT_COND LGT_CONDNAME WEATHER
## Length:39480 Min. :1.00 Length:39480 Min. : 1
## Class :character 1st Qu.:1.00 Class :character 1st Qu.: 1
## Mode :character Median :2.00 Mode :character Median : 1
## Mean :1.99 Mean : 6
## 3rd Qu.:3.00 3rd Qu.: 1
## Max. :9.00 Max. :99
##
## WEATHERNAME SCH_BUS SCH_BUSNAME RAIL
## Length:39480 Min. :0.000 Length:39480 Length:39480
## Class :character 1st Qu.:0.000 Class :character Class :character
## Mode :character Median :0.000 Mode :character Mode :character
## Mean :0.003
## 3rd Qu.:0.000
## Max. :1.000
##
## RAILNAME NOT_HOUR NOT_HOURNAME NOT_MIN
## Length:39480 Min. : 0.0 Length:39480 Min. : 0.0
## Class :character 1st Qu.:16.0 Class :character 1st Qu.:35.0
## Mode :character Median :99.0 Mode :character Median :98.0
## Mean :61.4 Mean :69.8
## 3rd Qu.:99.0 3rd Qu.:99.0
## Max. :99.0 Max. :99.0
##
## NOT_MINNAME ARR_HOUR ARR_HOURNAME ARR_MIN
## Length:39480 Min. : 0.0 Length:39480 Min. : 0.0
## Class :character 1st Qu.:16.0 Class :character 1st Qu.:37.0
## Mode :character Median :99.0 Mode :character Median :98.0
## Mean :62.7 Mean :70.8
## 3rd Qu.:99.0 3rd Qu.:99.0
## Max. :99.0 Max. :99.0
##
## ARR_MINNAME HOSP_HR HOSP_HRNAME HOSP_MN
## Length:39480 Min. : 0.0 Length:39480 Min. : 0
## Class :character 1st Qu.:88.0 Class :character 1st Qu.:88
## Mode :character Median :88.0 Mode :character Median :88
## Mean :77.8 Mean :81
## 3rd Qu.:99.0 3rd Qu.:99
## Max. :99.0 Max. :99
##
## HOSP_MNNAME FATALS x y
## Length:39480 Min. :1.00 Min. :-159.5 Min. :18.0
## Class :character 1st Qu.:1.00 1st Qu.: -99.1 1st Qu.:32.9
## Mode :character Median :1.00 Median : -88.0 Median :36.1
## Mean :1.08 Mean : -92.7 Mean :36.3
## 3rd Qu.:1.00 3rd Qu.: -81.5 3rd Qu.:40.3
## Max. :9.00 Max. : -65.5 Max. :71.3
## NA's :232 NA's :232
# Remove exact duplicate rows
dataset <- dataset %>% distinct()
# Report only the columns that actually contain missing values
missing_values <- colSums(is.na(dataset))
print(missing_values[missing_values > 0])
## TWAY_ID2 x y
## 29573 232 232
# Drop the three columns that carry missing values
dataset <- dataset %>%
  select(-c("TWAY_ID2", "x", "y"))
Dropping columns that duplicate the meaning of other columns or are irrelevant to the analysis.
# Drop redundant numeric code columns (their *NAME counterparts are kept)
# plus time/location fields not used in this analysis
dataset <- dataset %>% select(-STATE, -CITY, -COUNTY, -MONTH, -DAYNAME, -DAY_WEEK, -MINUTENAME, -ROUTE, -RUR_URB, -FUNC_SYS, -RD_OWNER, -NHS, -SP_JUR, -LATITUDENAME, -LONGITUDNAME, -MILEPT, -HARM_EV, -MAN_COLL, -MILEPTNAME, -RELJCT1, -RELJCT2, -TYP_INT, -REL_ROAD, -WRK_ZONE, -LGT_COND, -WEATHER, -SCH_BUS, -RAIL, -NOT_MIN, -ARR_MINNAME, -ARR_HOUR, -HOSP_MN, -SCH_BUSNAME, -RAILNAME, -PERNOTMVIT, -VE_FORMS, -PERSONS)
# Normalize column names: lower-case, spaces replaced with underscores
colnames(dataset)<- str_to_lower(colnames(dataset))
colnames(dataset) <- str_replace_all(colnames(dataset), " ", "_")
colnames(dataset)
# Convert categorical variables to factors; day of week gets an explicit
# Sunday-first level order so plots and tables follow the calendar
dataset$rur_urbname <- factor(dataset$rur_urbname)
dataset$day_weekname <- factor(dataset$day_weekname, levels = c("Sunday","Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
dataset$func_sysname <- factor(dataset$func_sysname)
dataset$lgt_condname <- factor(dataset$lgt_condname)
dataset$weathername <-factor(dataset$weathername)
dataset$statename <- factor(dataset$statename)
# Inspect the cleaned structure
str(dataset)
# Numeric columns screened for outliers
columns <- c("ve_total", "peds", "permvit", "arr_min", "hosp_hr")
# Long format: one (variable, value) pair per row for faceted boxplots
dataset_long <- dataset %>%
  select(all_of(columns)) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
# Boxplots with outliers highlighted in red
ggplot(dataset_long, aes(x = variable, y = value)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, outlier.size = 2) +
  facet_wrap(~ variable, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Boxplots of Selected Columns to Identify Outliers",
       x = "Variables",
       y = "Values")
# Remove rows whose values fall outside Tukey's 1.5*IQR fences.
#
# @param df      A data frame.
# @param columns Character vector of numeric column names to screen.
# @return `df` without the rows that had an outlier in any screened column.
remove_outliers <- function(df, columns) {
  df %>%
    # Flag out-of-fence values as NA in each screened column
    mutate(across(all_of(columns), ~ ifelse(
      . < (quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE)) |
        . > (quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)),
      NA, .))) %>%
    # Fix: restrict drop_na() to the screened columns. A bare drop_na()
    # also deletes rows with pre-existing NAs in *unrelated* columns,
    # silently discarding data the outlier rule never touched.
    drop_na(all_of(columns))
}
dataset <- remove_outliers(dataset, columns)
# Re-check the screened columns now that outliers have been removed
dataset_long <- dataset %>%
  select(all_of(columns)) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
# Same faceted boxplot layout as before, for a before/after comparison
ggplot(dataset_long, aes(x = variable, y = value)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, outlier.size = 2) +
  facet_wrap(~ variable, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Boxplots of Selected Columns After Outlier Removal",
       x = "Variables",
       y = "Values")
#{r, include=T, results='markup',message=TRUE} #write.csv(dataset, "cleaned_dataset.csv", row.names = FALSE) #
# Rural/urban subset for the road-type analysis.
# Fix: factor(x, labels = c("Rural", "Urban")) relied on the filtered factor
# retaining exactly two levels in that alphabetical order; if unused levels
# such as "Unknown" survive the filter (filter() does not drop factor
# levels), the label count mismatches and factor() errors — or worse,
# silently mislabels. Mapping the two wanted levels explicitly is robust.
road_data <- dataset %>%
  select(rur_urbname) %>%
  drop_na() %>%
  filter(!rur_urbname %in% c("Unknown", "Not Reported", "Trafficway Not in State Inventory")) %>%
  mutate(
    rur_urbname = factor(as.character(rur_urbname), levels = c("Rural", "Urban"))
  )
Research Question 1: How do temporal factors affect fatal motor vehicle accidents?
SMART question 1: How do day of week and hour of day affect the occurrence and severity of fatal accidents?
SMART question 2: Does the day of the week affect the day-to-day variation in total fatalities per day?
Subset for temporal question
# Keep the temporal factors (month, day, day of week, hour) and the
# fatality count needed for the temporal analysis
df_temporal <- dataset %>%
  select("monthname",
         "day",
         "day_weekname",
         "hour",
         "fatals")
# Derive the calendar date and categorical time features
df_temporal <- df_temporal %>%
  mutate(
    # Fix: monthname holds full month names ("January"), so the correct
    # strptime specifier is %B (full name). %b (abbreviated) only happens
    # to accept full names on some platforms and parses to NA on others.
    date = as.Date(paste("2022", monthname, day, sep = "-"), format = "%Y-%B-%d"),
    day_type = ifelse(day_weekname %in% c("Saturday", "Sunday"), "Weekend", "Weekday"),
    # Binary severity: single- vs multiple-fatality crashes
    fatal_category = ifelse(fatals > 1, "multiple", "one"),
    # NOTE(review): hour == 99 (unknown) falls into "Evening" here; the
    # hourly analysis below filters hour != 99, but confirm other uses.
    time_of_day = case_when(
      hour < 6 ~ "Early Morning",
      hour >= 6 & hour < 12 ~ "Morning",
      hour >= 12 & hour < 18 ~ "Afternoon",
      hour >= 18 ~ "Evening"))
# Ordered factors so plots and tables follow the natural order
df_temporal$day_weekname <- factor(df_temporal$day_weekname, levels = c("Sunday","Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
df_temporal$time_of_day <- factor(df_temporal$time_of_day, levels = c("Early Morning", "Morning", "Afternoon", "Evening"))
Check if every date in 2022 is included in the data frame, all dates in 2022 are included.
Frequency Analysis: Day of Week and Weekday vs Weekend This section calculates the frequency of occurrences for each day of the week and categorizes the results into weekdays and weekends.
Day of Week Analysis: Sunday to Friday each has frequency of 52 occurrences, while Saturday has 53. The distribution across the days is balanced.
Weekday vs Weekend Analysis: Weekdays have a total of 260 counts, and weekends have 105 counts.
# Exclude crashes whose hour is unknown (coded as 99)
df_hour <- df_temporal %>%
  filter(hour != 99)
# Number of fatal accidents per hour of the day
df_hour_accidents <- df_hour %>%
  count(hour, name = "total_accidents")
df_hour_accidents
# Top 5 hours with the highest number of fatal accidents
#head(df_hour_accidents[order(df_hour_accidents$total_accidents, decreasing = TRUE), ], 5)
# Top 5 hours with the lowest number of fatal accidents
#head(df_hour_accidents[order(df_hour_accidents$total_accidents), ], 5)
The peak hours for the highest number of accidents occur between 6 PM and 10 PM.

#### Day of Week
# Number of fatal accidents per day of the week
df_wday_accidents <- df_temporal %>%
  count(day_weekname, name = "total_accidents")
df_wday_accidents
Saturday and Sunday appear to have higher count of fatal accidents than rest of days of week.
# Hourly accident counts, colored by time-of-day bucket
df_hour %>%
  ggplot(aes(x = hour, fill = time_of_day)) +
  geom_bar() +
  scale_fill_manual(values = c("#999999", "#E69F00", "#D55E00", "#0072B2")) +
  labs(x = "hour", y = "Number of Fatal Accidents",
       title = "Count of Fatal Accidents by Hour",
       fill = "time of day") +
  theme_minimal()
Given the patterns in hour of day, we will categorize the day into four time groups: early morning, morning, afternoon, and evening.
# Tag each day of week as weekday or weekend in the summary table
df_wday_accidents <- df_wday_accidents %>%
  mutate(day_type = ifelse(day_weekname %in% c("Saturday", "Sunday"), "Weekend", "Weekday"))
# Frequency of fatal accidents by day of week, colored by day type.
# Fix: the original passed labels = c("weekend", "weekday") while the fill
# levels sort alphabetically as ("Weekday", "Weekend"), so the legend labels
# were swapped. Naming the values per level removes the ambiguity.
df_temporal %>% ggplot(aes(x = day_weekname, fill = day_type)) +
  geom_bar() +
  labs(x = "Day of the Week", y = "Number of Fatal Accidents",
       title = "Frequency of Fatal Accidents by Day of the Week",
       fill = "Day of Week") +
  scale_fill_manual(values = c("Weekday" = "#bfbfbf", "Weekend" = "#0072B2")) +
  theme_minimal()
We use a Chi-squared test to determine whether accidents are uniformly distributed across times of the day and days of the week.

#### Hour of Day

Based on the observed patterns of accident occurrence across the hours of the day, and following common practice, we divide the hours of the day into four groups.
# Goodness-of-fit test: is the occurrence of accidents uniform across the
# four time-of-day groups?
tod_counts <- table(df_hour$time_of_day)
tod_counts
chisq_result <- chisq.test(tod_counts)
chisq_result
# Standardized residuals reveal which cells drive the deviation
standardized_residuals <- with(chisq_result,
                               (observed - expected) / sqrt(expected))
print(standardized_residuals)
In the data set, six days of the week each occur 52 times and one occurs 53 times. Even though the days are not perfectly equal, the impact will likely be minimal given the small difference between 52 and 53. For simplicity, and following the common assumption, I will use an equal distribution across all seven days in the test.
# Goodness-of-fit test: is the occurrence of accidents uniform across the
# seven days of the week?
wday_counts <- table(df_temporal$day_weekname)
wday_counts
chi_sq_test <- chisq.test(wday_counts)
print(chi_sq_test)
# Standardized residuals per day: which days deviate most from uniformity
observed <- chi_sq_test$observed
expected <- chi_sq_test$expected
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
The p-values for both tests are less than 0.05, indicating a statistically significant deviation from a uniform distribution and suggesting that certain hours and days have higher frequencies of accidents than others.
Since the majority of accident has one fatality in an accident and limited occurrence on accidents with fatalities more than four, we will group the number of fatalities into “one” and “multiple” fatalities.
# Contingency table: time of day x fatality category (one vs multiple)
hr_contingency <- table(df_hour$time_of_day, df_hour$fatal_category)
hr_contingency
# Exploratory counts of severity per day of week
fatality_counts <- df_temporal %>%
  group_by(day_weekname, fatal_category) %>%
  summarize(count = n())
df_temporal
# Contingency table: day of week x fatality category
wday_contingency <- table(df_temporal$day_weekname, df_temporal$fatal_category)
wday_contingency
# Data-frame version of the hourly table with readable column names
df_hr_contingency <- data.frame(hr_contingency)
colnames(df_hr_contingency) <- c("time_of_day", "fatal_category", "Freq")
df_hr_contingency
# Share of one- vs multiple-fatality crashes within each time-of-day group
grouped_hr_contingency <- df_hr_contingency %>%
  group_by(time_of_day) %>%
  mutate(proportion = Freq / sum(Freq))
grouped_hr_contingency
# Proportion bars, faceted by fatality category
grouped_hr_contingency %>%
  ggplot(aes(x = time_of_day, y = proportion, fill = fatal_category)) +
  geom_bar(stat = "identity", alpha = 0.75) +
  facet_wrap(~ fatal_category, scales = "free_y")
# Convert the day-of-week contingency table into a data frame with
# readable column names
df_wday_contingency <- data.frame(wday_contingency)
colnames(df_wday_contingency)<- c("day_weekname", "fatal_category", "Freq")
# Visualization of severity by day of week
# Raw frequency counts (kept for reference; proportions are plotted below)
#df_wday_contingency %>% ggplot(aes(x= day_weekname, y = Freq, fill = fatal_category))+
# geom_bar(stat="identity", alpha = 0.75) +
# facet_wrap(~ fatal_category)+
# labs(x = "Day of the Week", y = "Frequency") +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Proportions: within each day of week, the share of one- vs
# multiple-fatality crashes
grouped_wday_contingency <- df_wday_contingency %>%
group_by(day_weekname) %>%
mutate(proportion = Freq / sum(Freq))
grouped_wday_contingency
# Proportion bars, faceted by fatality category
grouped_wday_contingency %>%
ggplot(aes(x= day_weekname, y=proportion, fill = fatal_category))+
geom_bar(stat="identity", alpha = 0.75) +
facet_wrap(~ fatal_category, scales = "free_y") +
labs(x = "Day of the Week", y = "Proportion") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Chi-squared test: hour of day
# Test of independence between time-of-day bucket and fatality category
# ('one' vs 'multiple') using the hr_contingency table built above.
chi_sq_result <- chisq.test(hr_contingency)
print(chi_sq_result)
# standardized residuals table
# get the expected and observed values
observed <- chi_sq_result$observed
expected <- chi_sq_result$expected
# calculate the standardized residuals
# (the same values are available directly as chi_sq_result$residuals)
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
Since the p-value is less than 0.05, we can reject the null hypothesis at the 5% level of significance. This indicates a significant relationship between the number of fatalities per accident ('one' vs. 'multiple') and the time of day.
# Chi-squared test of independence: day of week vs fatality category.
chi_sq_test<- chisq.test(wday_contingency)
print(chi_sq_test)
# Get observed and expected counts
observed <- chi_sq_test$observed
expected <- chi_sq_test$expected
# Calculate standardized residuals to see which day/category cells drive
# the association.
standardized_residuals <- (observed - expected) / sqrt(expected)
print(standardized_residuals)
The result reveals a clear pattern in the data, highlighting that weekends are associated with more severe accidents involving multiple fatalities.
We now look further into total fatalities per day to provide a more nuanced understanding of how temporal factors affect fatal accidents.
# Data frame for daily fatalities: one row per calendar date with the
# number of accidents, mean fatalities per accident, and total fatalities.
# FIX: the original ran this identical pipeline twice back-to-back; the
# redundant duplicate has been removed.
df_daily <- df_temporal %>%
group_by(date, day_weekname) %>%
summarise(
total_accidents = n(),
average_fatals = mean(fatals),
total_fatals = sum(fatals)
) %>%
ungroup()
# convert to the right data type
# Ordered factor so plots display days Sunday through Saturday.
df_daily$day_weekname <- factor(df_daily$day_weekname, levels = c("Sunday","Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
# Top 5 Highest Daily Fatalities
head(df_daily[order(df_daily$total_fatals, decreasing = TRUE), ], 5)
# Top 5 Lowest Daily Fatalities
tail(df_daily[order(df_daily$total_fatals, decreasing = TRUE), ], 5)
# Print the daily table for inspection.
df_daily
# Flag each date as Weekend (Sat/Sun) or Weekday for later comparisons.
df_daily <- df_daily %>%
mutate(day_type = ifelse(day_weekname %in% c("Saturday", "Sunday"), "Weekend", "Weekday"))
# Boxplot of daily fatalities by day of the week, colored weekday vs weekend.
# Unnamed fill values map in factor-level order: "Weekday" grey, "Weekend" blue.
df_daily %>% ggplot(aes(x = day_weekname, y = total_fatals, fill = day_type))+
geom_boxplot() +
labs(x = "Day of the Week",
y = "Daily Fatalities",
title = "Daily Fatalities by Day of the Week")+
scale_fill_manual(values = c("#bfbfbf","#0072B2"))
The boxplot shows that daily fatalities are generally higher on weekends. This suggests that weekends tend to have a higher count of fatalities, potentially due to increased travel.
# Per-weekday summary of the daily fatality counts: number of days
# observed, mean, and standard deviation.
df_summary_day_week <- df_daily %>%
  group_by(day_weekname) %>%
  summarise(
    sample_size = n(),
    avg_fatalities = mean(total_fatals),
    std_fatalities = sd(total_fatals),
    .groups = "drop"
  )
df_summary_day_week
The average and standard deviation of daily fatalities are notably higher on weekends, indicating both a higher mean and greater variability in fatalities compared to weekdays. The sample size is balanced across groups.
# QQ plot
# One Q-Q plot per weekday to visually check normality of the daily totals.
for(day in unique(df_daily$day_weekname)) {
qqnorm(df_daily$total_fatals[df_daily$day_weekname == day],
main = paste("Q-Q Plot for ", day))
qqline(df_daily$total_fatals[df_daily$day_weekname == day], col = "blue")
}
# Shapiro: Normality test per weekday (p > 0.05: normality not rejected)
df_daily %>%
group_by(day_weekname) %>%
summarise(normality_p_value = shapiro.test(total_fatals)$p.value)
# Levene test: Homogeneity of Variance across weekdays (car::leveneTest)
leveneTest(total_fatals ~ day_weekname, data = df_daily)
Since the number of fatalities per day is relatively large and spans a wide range of values, the daily fatality counts approximate a continuous variable. Combined with sufficiently large sample sizes for each day of the week, the Central Limit Theorem applies, making the sampling distribution of the mean fatalities per day approximately normal. Additionally, normality tests on daily fatalities grouped by day of the week do not reject the assumption of normality. Therefore, we can treat fatalities per day as approximately continuous and appropriately use ANOVA to test for differences across the days of the week
After performing Levene's test for homogeneity of variance, we conclude that the variances across the groups are significantly different at the 5% level of significance. This means the homogeneity-of-variance assumption of ANOVA is violated, so we use Welch's ANOVA in the following section.
# Welch's ANOVA
# oneway.test with var.equal = FALSE does not assume equal group
# variances, which Levene's test above rejected.
Wanova_result <- oneway.test(total_fatals ~ day_weekname, data = df_daily, var.equal = FALSE)
Wanova_result
# post hoc test-Games Howell
# Games-Howell pairwise comparisons (PMCMRplus), appropriate under
# unequal variances.
posthoc_result <- gamesHowellTest(total_fatals ~ day_weekname, data = df_daily)
print(posthoc_result)
With p-values below 0.05, there is a statistically significant difference in the number of daily fatalities across the days of the week. This indicates that the day of the week can affect daily fatality counts.
Let’s summarize the data to calculate the total fatalities by state.
# Total fatalities per state/county pair.
# NOTE(review): column names here are lowercase (statename, fatals) while
# read_csv reported uppercase names — presumably columns were renamed to
# lowercase earlier in the document; confirm.
state_county_summary <- dataset %>%
group_by(statename, countyname) %>%
summarise(Total_Fatalities = sum(fatals)) %>%
ungroup()
## `summarise()` has grouped output by 'statename'. You can override using the
## `.groups` argument.
# Roll counties up to state totals, largest first.
state_summary <- state_county_summary %>%
group_by(statename) %>%
summarise(Total_Fatalities = sum(Total_Fatalities)) %>%
arrange(desc(Total_Fatalities))
# 1. State-Level Analysis: Plot top 10 states by total fatalities
top_state_summary <- state_summary %>%
slice_head(n = 10) # Select top 10 states by total fatalities
# Horizontal gradient bar chart with count labels outside each bar.
state_bar_plot <- ggplot(top_state_summary, aes(x = reorder(statename, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
geom_text(aes(label = Total_Fatalities), hjust = -0.2, color = "black", size = 4) +
labs(title = "Top 10 States by Total Fatalities",
x = "State",
y = "Total Number of Fatalities") +
theme_minimal() +
scale_fill_gradient(low = "lightblue", high = "darkred") +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
legend.position = "none"
) +
coord_flip() # Horizontal bar plot
# Interactive version via plotly.
state_plotly <- ggplotly(state_bar_plot, tooltip = c("x", "y"))
state_plotly
# 2. County-Level Analysis: Plot top 10 counties by fatalities within a selected state
# Hard-coded example state; change this value to inspect another state.
selected_state <- "Texas"
county_data <- state_county_summary %>%
filter(statename == selected_state) %>%
arrange(desc(Total_Fatalities)) %>%
slice_head(n = 10)
# Same layout as the state-level chart, with a different palette.
county_bar_plot <- ggplot(county_data, aes(x = reorder(countyname, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
geom_text(aes(label = Total_Fatalities), hjust = -0.2, color = "black", size = 4) +
labs(title = paste("Top 10 Counties by Total Fatalities in", selected_state),
x = "County",
y = "Total Number of Fatalities") +
theme_minimal() +
scale_fill_gradient(low = "lightyellow", high = "darkorange") +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
legend.position = "none"
) +
coord_flip()
county_plotly <- ggplotly(county_bar_plot, tooltip = c("x", "y"))
county_plotly
Research Question 2: What is the relationship between weather and lighting conditions on the occurrence of fatal accidents?
library(dplyr)
library(ggplot2)
library(plotly)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# Filtering out unknown or not reported values in the dataset
# NOTE(review): this overwrites `dataset` in place, so every subsequent
# chunk in the document sees only the filtered rows — confirm intended.
dataset <- dataset %>%
filter(!(weathername %in% c("Not Reported", "Reported as Unknown"))) %>%
filter(!(lgt_condname %in% c("Not Reported", "Reported as Unknown")))
# Aggregate data for weather conditions
weather_data <- dataset %>%
group_by(weathername) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
# Aggregate data for lighting conditions, largest totals first.
lighting_data <- dataset %>%
group_by(lgt_condname) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
# 1. Plot for Weather Conditions with Log Scale and Original Value Labels
# The log10 y-axis keeps rare conditions visible; the text labels show
# the raw (unlogged) totals.
weather_plot <- ggplot(weather_data, aes(x = reorder(weathername, Total_Fatalities), y = Total_Fatalities)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
scale_y_log10() +
labs(title = "Fatalities by Weather Conditions", x = "Weather Condition", y = "Total Fatalities (Log Scale)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = comma(Total_Fatalities)),
hjust = -0.2, size = 3, color = "black")
weather_plotly <- ggplotly(weather_plot, tooltip = c("x", "y"))
weather_plotly
# 2. Plot for Lighting Conditions with Log Scale and Original Value Labels
# Same layout as the weather plot, for lighting conditions.
lighting_plot <- ggplot(lighting_data, aes(x = reorder(lgt_condname, Total_Fatalities), y = Total_Fatalities)) +
geom_bar(stat = "identity", fill = "orange") +
coord_flip() +
scale_y_log10() +
labs(title = "Fatalities by Lighting Conditions", x = "Lighting Condition", y = "Total Fatalities (Log Scale)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = comma(Total_Fatalities)),
hjust = -0.2, size = 3, color = "black")
lighting_plotly <- ggplotly(lighting_plot, tooltip = c("x", "y"))
lighting_plotly
# Aggregating fatal counts by weather and lighting
weather_light <- dataset %>%
group_by(weathername, lgt_condname) %>%
summarize(total_fatals = sum(fatals, na.rm = TRUE))
## `summarise()` has grouped output by 'weathername'. You can override using the
## `.groups` argument.
# Interactive Heatmap
# z encodes the fatality total for each weather x lighting cell.
heatmap_plot <- plot_ly(weather_light, x = ~weathername, y = ~lgt_condname, z = ~total_fatals, type = "heatmap") %>%
layout(title = "Fatality Count by Weather and Lighting Condition",
xaxis = list(title = "Weather Condition"),
yaxis = list(title = "Lighting Condition"))
heatmap_plot
# Fatalities in adverse weather combined with dark lighting, by state.
# FIX: the lighting labels previously used inconsistent spacing
# ("Dark-Not Lighted", "Dark- Unknown Lighting") that does not match the
# values present in the data (the TukeyHSD output shows "Dark - Not
# Lighted" and "Dark - Unknown Lighting"), so those rows were silently
# excluded by the filter.
# NOTE(review): the filter combines the two conditions with AND, while
# the plot title says "Adverse Weather OR Poor Lighting" — confirm which
# is intended. The weather labels should also be checked against
# unique(dataset$weathername) for the same spacing issue.
state_analysis <- dataset %>%
filter(weathername %in% c("Rain", "Sleet or Hail", "Blowing Snow","Blowing Sand,Soil Dirt","Severe Crosswinds","Fog,Smog,Smoke","Freezing Rain or Drizzle","Snow"),
lgt_condname %in% c("Dark - Not Lighted", "Dark - Lighted", "Dark - Unknown Lighting")) %>%
group_by(statename) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
# Select the top 10 states with the highest fatalities
top_adverse_states <- head(state_analysis, 10)
# Horizontal gradient bar chart of the top-10 states.
ggplot(top_adverse_states, aes(x = reorder(statename, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "Top 10 States by Fatal Accidents in Adverse Weather or Poor Lighting",
x = "State", y = "Total Fatalities") +
theme_minimal() +
theme(axis.text.y = element_text(size = 10),
legend.position = "none")
# Same state ranking, restricted to fair weather and good lighting.
# NOTE(review): this reuses (overwrites) the `state_analysis` name from
# the adverse-conditions chunk above.
state_analysis <- dataset %>%
filter(weathername %in% c("Clear","Cloudy"), lgt_condname %in% c("Daylight", "Dawn","Dusk")) %>%
group_by(statename) %>%
summarise(Total_Fatalities = sum(fatals, na.rm = TRUE)) %>%
arrange(desc(Total_Fatalities))
# Select the top 10 states with the highest fatalities
top_states <- head(state_analysis, 10)
ggplot(top_states, aes(x = reorder(statename, Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "States with highest Fatal Accidents in clear weather and good lighting",
x = "State", y = "Total Fatalities") +
theme_minimal() +
theme(axis.text.y = element_text(size = 10),
legend.position = "none")
# One-way ANOVA: does mean fatalities per crash differ across lighting
# conditions?
anova_lighting <- aov(fatals ~ lgt_condname, data = dataset)
summary(anova_lighting)
## Df Sum Sq Mean Sq F value Pr(>F)
## lgt_condname 6 3 0.534 4.52 0.00014 ***
## Residuals 21994 2597 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons with family-wise error control.
tukey_results <- TukeyHSD(anova_lighting, "lgt_condname")
print(tukey_results)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = fatals ~ lgt_condname, data = dataset)
##
## $lgt_condname
## diff lwr upr p adj
## Dark - Not Lighted-Dark - Lighted 0.00636 -0.0146 0.027261 0.973
## Dark - Unknown Lighting-Dark - Lighted 0.00457 -0.0529 0.062097 1.000
## Dawn-Dark - Lighted -0.03764 -0.0887 0.013414 0.310
## Daylight-Dark - Lighted -0.01869 -0.0376 0.000196 0.055
## Dusk-Dark - Lighted -0.00662 -0.0540 0.040757 1.000
## Other-Dark - Lighted 0.10072 -0.3527 0.554125 0.995
## Dark - Unknown Lighting-Dark - Not Lighted -0.00178 -0.0585 0.054969 1.000
## Dawn-Dark - Not Lighted -0.04399 -0.0942 0.006186 0.130
## Daylight-Dark - Not Lighted -0.02505 -0.0414 -0.008663 0.000
## Dusk-Dark - Not Lighted -0.01297 -0.0594 0.033460 0.983
## Other-Dark - Not Lighted 0.09436 -0.3590 0.547673 0.996
## Dawn-Dark - Unknown Lighting -0.04221 -0.1156 0.031203 0.619
## Daylight-Dark - Unknown Lighting -0.02327 -0.0793 0.032772 0.885
## Dusk-Dark - Unknown Lighting -0.01119 -0.0821 0.059715 0.999
## Other-Dark - Unknown Lighting 0.09614 -0.3603 0.552611 0.996
## Daylight-Dawn 0.01895 -0.0304 0.068322 0.919
## Dusk-Dawn 0.03102 -0.0347 0.096790 0.807
## Other-Dawn 0.13836 -0.3173 0.594054 0.973
## Dusk-Daylight 0.01207 -0.0335 0.057636 0.987
## Other-Daylight 0.11941 -0.3338 0.572632 0.987
## Other-Dusk 0.10734 -0.3480 0.562637 0.993
# Filter data for adverse and clear weather conditions
adverse_conditions <- c("Rain", "Sleet or Hail", "Blowing Snow","Blowing Sand,Soil Dirt","Severe Crosswinds","Fog,Smog,Smoke","Freezing Rain or Drizzle","Snow")
fair_conditions <- c("Clear","Cloudy")
# Summarize fatalities by state for adverse weather conditions
adverse_weather <- dataset %>%
filter(weathername %in% adverse_conditions) %>%
group_by(statename) %>%
summarise(Adverse_Fatalities = sum(fatals, na.rm = TRUE))
# Summarize fatalities by state for fair weather conditions.
# FIX: the original used `weathername == fair_conditions`, which recycles
# the length-2 vector element-wise and silently keeps only alternating
# matches; `%in%` tests membership in the set, which is what is intended.
fair_weather <- dataset %>%
filter(weathername %in% fair_conditions) %>%
group_by(statename) %>%
summarise(Clear_Fatalities = sum(fatals, na.rm = TRUE))
# Merge the data for adverse and clear weather fatalities by state
weather_comparison <- merge(adverse_weather, fair_weather, by = "statename", all = TRUE)
# Replace NA values with 0 (in case a state has no fatalities in one of the conditions)
weather_comparison[is.na(weather_comparison)] <- 0
# Perform the independent (Welch) t-test comparing state-level totals
# under adverse vs fair weather.
t_test_result <- t.test(weather_comparison$Adverse_Fatalities, weather_comparison$Clear_Fatalities,
alternative = "two.sided", var.equal = FALSE)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: weather_comparison$Adverse_Fatalities and weather_comparison$Clear_Fatalities
## t = -4, df = 53, p-value = 1e-04
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -257.8 -91.6
## sample estimates:
## mean of x mean of y
## 35.4 210.0
Research Question 3: How do road conditions (urban/rural, road type, and traffic controls) impact the severity of crashes?
# Keep only records with a usable functional-system classification,
# then summarise fatalities by road type. (Rows where func_sysname or
# fatals is NA are removed by drop_na(), matching the original filter.)
excluded_classes <- c("Unknown", "Not Reported", "Trafficway Not in State Inventory")
road_fatalities <- dataset %>%
  filter(!func_sysname %in% excluded_classes) %>%
  select(func_sysname, fatals) %>%
  drop_na() %>%
  mutate(func_sysname = as.factor(func_sysname))
# Totals, dispersion, and record counts per road type.
fatality_summary <- road_fatalities %>%
  group_by(func_sysname) %>%
  summarise(
    Total_Fatalities = sum(fatals),
    SD_Fatalities = sd(fatals), # Standard deviation for reference
    Count = n()
  )
print(fatality_summary)
## # A tibble: 7 × 4
## func_sysname Total_Fatalities SD_Fatalities Count
## <fct> <dbl> <dbl> <int>
## 1 Interstate 3223 0.360 2914
## 2 Local 2592 0.263 2446
## 3 Major Collector 4346 0.328 4012
## 4 Minor Arterial 5140 0.345 4703
## 5 Minor Collector 911 0.265 864
## 6 Principal Arterial - Other 6486 0.383 5867
## 7 Principal Arterial - Other Freeways and … 1149 0.350 1049
# Sort the summary data by Total Fatalities for better visual clarity
fatality_summary <- fatality_summary %>%
arrange(desc(Total_Fatalities))
# Enhanced bar plot with additional features
# Vertical gradient bars with value labels; x labels rotated 45 degrees.
fatality_total_bar_plot <- ggplot(fatality_summary, aes(x = reorder(func_sysname, -Total_Fatalities), y = Total_Fatalities, fill = Total_Fatalities)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
geom_text(aes(label = Total_Fatalities), vjust = -0.5, color = "black", size = 3.5) +
labs(title = "Total Fatalities by Road Type",
x = "Road Type",
y = "Total Number of Fatalities") +
theme_minimal() +
scale_fill_gradient(low = "lightblue", high = "darkred") +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
# Interactive version with a fixed canvas size.
fatality_total_bar_plotly <- ggplotly(fatality_total_bar_plot, tooltip = c("x", "y", "Total_Fatalities")) %>%
layout(
xaxis = list(title = "Road Type"),
yaxis = list(title = "Total Number of Fatalities"),
width = 900,
height = 500
)
fatality_total_bar_plotly
Since we’re now dealing with total counts rather than averages, a chi-square test can be more suitable than an ANOVA to determine if there are statistically significant differences in the distribution of fatalities across road types.
# NOTE(review): every FARS record involves at least one fatality, so
# `fatals > 0` is very likely TRUE for every row; the table then collapses
# to a single column and chisq.test falls back to a goodness-of-fit test
# across road types (the printed result, "Chi-squared test for given
# probabilities", is consistent with that). If the intent is to contrast
# single- vs multiple-fatality crashes by road type, use `fatals > 1`.
fatality_table <- table(road_fatalities$func_sysname, road_fatalities$fatals > 0)
chi_square_result <- chisq.test(fatality_table)
print(chi_square_result)
##
## Chi-squared test for given probabilities
##
## data: fatality_table
## X-squared = 18848, df = 9, p-value <2e-16
# Crash counts by route type and functional system.
# NOTE(review): `fatals = n()` counts crashes (rows), not fatalities —
# the column name is misleading; downstream plots use it as a frequency.
route_function_summary <- dataset %>%
group_by(routename, func_sysname) %>%
summarise(fatals = n()) %>%
arrange(desc(fatals)) %>%
ungroup()
## `summarise()` has grouped output by 'routename'. You can override using the
## `.groups` argument.
# Filter the top 10 high-frequency routes and functional systems
top_route_function_summary <- route_function_summary %>%
slice_head(n = 10)
# Horizontal bars: crash counts for the top route/system combinations,
# colored by functional system.
crash_freq_plot <- ggplot(top_route_function_summary, aes(x = reorder(routename, fatals), y = fatals, fill = func_sysname)) +
geom_bar(stat = "identity", color = "black", width = 0.7) +
labs(title = "Top Routes and Func Systems by Crash Freq",
x = "Route Name",
y = "Fatals") +
theme_minimal() +
scale_fill_brewer(palette = "Paired") +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
legend.title = element_text(size = 10),
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
coord_flip()
crash_freq_plotly <- ggplotly(crash_freq_plot, tooltip = c("x", "y", "fill"))
crash_freq_plotly
# Heatmap of crash counts for the same top route/system combinations.
crash_freq_heatmap <- ggplot(top_route_function_summary, aes(x = func_sysname, y = reorder(routename, fatals), fill = fatals)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkred", name = "Crash Count") +
labs(title = "Crash Frequency Heatmap by Route and Functional System",
x = "Functional System",
y = "Route Name") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(size = 10)
)
crash_freq_heatmap_plotly <- ggplotly(crash_freq_heatmap, tooltip = c("x", "y", "fill"))
crash_freq_heatmap_plotly
# Drop unusable functional-system and urban/rural classifications.
filtered_dataset <- dataset %>%
filter(!func_sysname %in% c("Unknown", "Not Reported", "Trafficway Not in State Inventory"),
!rur_urbname %in% c("Unknown", "Not Reported", "Trafficway Not in State Inventory"))
# Summarize crash severity and frequency by functional system and urban/rural classification
# "Severity" here is measured as mean fatalities per crash.
summary_data <- filtered_dataset %>%
group_by(rur_urbname, func_sysname) %>%
summarise(
Avg_Severity = mean(fatals, na.rm = TRUE),
Crash_Frequency = n()
) %>%
ungroup()
## `summarise()` has grouped output by 'rur_urbname'. You can override using the
## `.groups` argument.
# For each functional system, print one severity plot and one frequency
# plot, each comparing urban vs rural.
for (func_sys in unique(summary_data$func_sysname)) {
# NOTE(review): `subset` shadows base::subset inside this loop.
subset <- summary_data %>%
filter(func_sysname == func_sys)
# Plot for Average Crash Severity with Urban and Rural together
severity_plot <- ggplot(subset, aes(x = rur_urbname, y = Avg_Severity, fill = rur_urbname)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = paste("Average Crash Severity by Area Type -", func_sys),
x = "Area Type (Urban/Rural)",
y = "Avg. Severity") +
scale_fill_manual(values = c("Urban" = "skyblue", "Rural" = "salmon")) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(size = 12, face = "bold"),
legend.position = "none"
)
print(severity_plot)
# Plot for Crash Frequency with Urban and Rural together
frequency_plot <- ggplot(subset, aes(x = rur_urbname, y = Crash_Frequency, fill = rur_urbname)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = paste("Crash Frequency by Area Type -", func_sys),
x = "Area Type (Urban/Rural)",
y = "Frequency") +
scale_fill_manual(values = c("Urban" = "lightgreen", "Rural" = "darkorange")) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(size = 12, face = "bold"),
legend.position = "none"
)
print(frequency_plot)
}
By converting monthname into a factor with a defined order, we ensure that the boxplot accurately reflects the seasonal variations in fatalities. This visualization highlights any monthly patterns or trends in fatal accidents, allowing for a clearer understanding of when fatalities are most likely to occur throughout the year.
# Create total_fatalities based on the fatals column
# Binary indicator (1 = at least one fatality).
# NOTE(review): every FARS record is a fatal crash, so this is likely 1
# for all rows — confirm the intent; it is overwritten again below.
dataset$total_fatalities <- ifelse(dataset$fatals > 0, 1, 0)
# Boxplot of Fatalities by Month
ggplot(dataset, aes(x=factor(monthname, levels=month.name), y=fatals)) + # Convert monthname to a factor
geom_boxplot(fill="lightgreen") +
labs(title="Boxplot of Fatalities by Month", x="Month", y="Total Fatalities") +
theme_minimal()
We create the total_fatalities variable, which aggregates the fatalities from the dataset. We then perform the Shapiro-Wilk normality test to assess whether the data is normally distributed. A Q-Q plot is also generated to visually inspect the normality of the data distribution, helping us understand the underlying characteristics of the fatalities.
str(dataset)
# Re-point total_fatalities at the raw fatality count.
# NOTE(review): this overwrites the 0/1 indicator created just above
# from `fatals > 0` — confirm that is intended.
dataset$total_fatalities <- dataset$fatals
non_missing_count <- sum(!is.na(dataset$total_fatalities))
# FIX: these cat() calls used "\\n", which prints a literal backslash-n
# instead of a newline; "\n" is the newline escape (as used elsewhere in
# this document).
cat("Number of non-missing values in total_fatalities:", non_missing_count, "\n")
unique_values <- unique(dataset$total_fatalities)
summary(dataset$total_fatalities)
cat("Unique values in total_fatalities:", unique_values, "\n")
# shapiro.test() only accepts 3..5000 observations; with ~39k rows this
# falls through to the message branch.
if (non_missing_count >= 3 && non_missing_count <= 5000) {
shapiro_test <- shapiro.test(dataset$total_fatalities)
cat("Shapiro-Wilk Normality Test:\n")
print(shapiro_test)
} else {
cat("Insufficient sample size for Shapiro-Wilk test.\n")
}
# Visual normality check against the theoretical normal quantiles.
qqnorm(dataset$total_fatalities, main="Q-Q Plot for Total Fatalities")
qqline(dataset$total_fatalities, col="red")
### The Q-Q plot displays the quantiles of the sample data (total fatalities) against the quantiles of a theoretical normal distribution. If the data points closely follow the red line, it indicates that the data is normally distributed.
# Chi-square goodness-of-fit on crash counts per state (uniform null).
# NOTE(review): a uniform expectation across states ignores population
# and traffic-exposure differences, so significance is near-certain.
table_states <- table(dataset$statename)
chi_square_test <- chisq.test(table_states)
cat("Chi-Square Test Results:\n")
print(chi_square_test)
# National fatality totals per month.
monthly_summary <- dataset %>%
group_by(monthname) %>%
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# Calendar ordering for the x-axis.
monthly_summary$monthname <- factor(monthly_summary$monthname, levels = month.name)
# Line + point trend plot. NOTE(review): `size` for lines is deprecated
# in ggplot2 >= 3.4 in favor of `linewidth`.
ggplot(monthly_summary, aes(x=monthname, y=total_fatalities)) +
geom_line(group=1, color="blue", size=1.2) +
geom_point(color="red", size=3) +
labs(title="Trends in Fatal Accidents by Month (2022)",
x="Month",
y="Total Fatalities") +
theme_minimal()
# Summarizing total fatalities by state and month
state_monthly_summary <- dataset %>%
group_by(statename, monthname) %>% # Ensure correct column names
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# Display the head of the summary
print(head(state_monthly_summary))
# Create a factor for monthname to ensure it is ordered correctly
state_monthly_summary$monthname <- factor(state_monthly_summary$monthname, levels = month.name)
# Plotting state-wise trends in fatalities by month
# One line per state; legend at the bottom, rotated month labels.
ggplot(state_monthly_summary, aes(x=monthname, y=total_fatalities, color=statename, group=statename)) +
geom_line(size=1, alpha=0.7) +
geom_point(size=2, alpha=0.8) +
scale_y_continuous(breaks = seq(0, max(state_monthly_summary$total_fatalities, na.rm = TRUE), by = 10)) +
labs(title="State-wise Trends in Fatal Accidents by Month",
x="Month",
y="Total Fatalities",
color="State") +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.minor = element_blank())
# Save the plot as a PNG file
# ggsave() with no plot argument writes the last plot displayed.
ggsave("enlarged_state_trends.png", width = 18, height = 12, dpi = 300)
# State/month fatality totals (same aggregation as above).
regional_monthly_summary <- dataset %>%
group_by(statename, monthname) %>%
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# For each month, keep the single state with the highest total.
high_risk_states <- regional_monthly_summary %>%
group_by(monthname) %>%
slice(which.max(total_fatalities)) %>%
select(monthname, statename, total_fatalities)
cat("States with the highest number of fatalities for each month:\n")
print(high_risk_states)
# State/month fatality totals for the trend analysis.
regional_trends <- dataset %>%
group_by(statename, monthname) %>%
summarize(total_fatalities = sum(fatals, na.rm = TRUE), .groups = 'drop')
# Map month names to 1..12 so correlation against time is possible.
regional_trends <- regional_trends %>%
mutate(month_numeric = match(monthname, month.name))
# Sign of the month/fatalities correlation as a crude trend direction.
# NOTE(review): cor() returns NA when a state's totals are constant, in
# which case `direction` is NA as well.
trend_direction <- regional_trends %>%
group_by(statename) %>%
filter(n() > 1) %>%
summarize(trend = cor(month_numeric, total_fatalities, use = "complete.obs")) %>%
mutate(direction = ifelse(trend > 0, "Increasing", "Decreasing"))
cat("Trend Analysis Summary for Fatalities in 2022:\n")
print(trend_direction)
We derive the EMS arrival time from the ARR_HOUR and ARR_MIN fields and ensure the FATALS column is numeric for analysis.
# Calculate EMS arrival time in minutes using only the minute column
# NOTE(review): this uses only the arrival-minute field (arr_min), which
# is a clock minute, not an elapsed response time — deriving
# (arrival time - crash time) would be more meaningful; confirm intent.
dataset$ems_arrival_time <- as.numeric(dataset$arr_min)
# Check if FATALS column exists and convert it to numeric
if ("fatals" %in% colnames(dataset)) {
dataset$fatals <- as.numeric(dataset$fatals)
} else {
stop("FATALS column is missing in the dataset.")
}
# Categorize fatal accidents into severity levels
# cut() with right-closed breaks: (0,1] = Low, (1,3] = Medium,
# (3,5] = High, (5,Inf] = Very High.
dataset$severity_level <- cut(
dataset$fatals,
breaks = c(0, 1, 3, 5, Inf),
labels = c("Low (1 Fatality)", "Medium (2-3 Fatalities)", "High (4-5 Fatalities)", "Very High (6+ Fatalities)")
)
# Overview of the EMS-related data after preparation
glimpse(dataset[, c("ems_arrival_time", "fatals", "severity_level")])
summary(dataset[, c("ems_arrival_time", "fatals", "severity_level")])
# Descriptive statistics for EMS arrival time (minutes) and fatalities.
ems_stats <- summarize(
  dataset,
  mean_arrival_time = mean(ems_arrival_time, na.rm = TRUE),
  median_arrival_time = median(ems_arrival_time, na.rm = TRUE),
  sd_arrival_time = sd(ems_arrival_time, na.rm = TRUE),
  min_arrival_time = min(ems_arrival_time, na.rm = TRUE),
  max_arrival_time = max(ems_arrival_time, na.rm = TRUE),
  mean_fatalities = mean(fatals, na.rm = TRUE),
  max_fatalities = max(fatals, na.rm = TRUE)
)
print(ems_stats)
# Distribution of EMS arrival times within each severity level.
ggplot(dataset, aes(x=severity_level, y=ems_arrival_time, fill=severity_level)) +
geom_boxplot(alpha=0.6) +
labs(title="EMS Arrival Time by Fatality Severity Level",
x="Severity Level",
y="EMS Arrival Time (minutes)",
fill="Severity Level") +
theme_minimal()
# Pearson correlation between EMS arrival minute and fatality count.
# NOTE(review): cor.test() has no `use` argument — it is absorbed by
# `...` and ignored; cor.test drops incomplete pairs itself.
correlation_severity <- cor.test(dataset$ems_arrival_time, dataset$fatals, use="complete.obs")
cat("Correlation between EMS Arrival Time and Severity (Number of Fatalities):\n")
print(correlation_severity)
# Simple linear regression of fatalities on arrival time.
linear_model <- lm(fatals ~ ems_arrival_time, data = dataset)
summary(linear_model)
# Scatter plot with the fitted regression line and confidence band.
ggplot(dataset, aes(x = ems_arrival_time, y = fatals)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(title = "Linear Regression: EMS Arrival Time vs Number of Fatalities",
x = "EMS Arrival Time (minutes)",
y = "Number of Fatalities") +
theme_minimal()
We will use an ANOVA test to see if there’s a statistically significant difference in EMS arrival times across different severity levels.
# ANOVA: do mean EMS arrival times differ across severity levels?
anova_result <- aov(ems_arrival_time ~ severity_level, data = dataset)
summary(anova_result)
TukeyHSD(anova_result)
# Binary outcome: 1 when a crash has more than 3 fatalities (i.e. the
# High / Very High severity buckets), 0 otherwise.
dataset$high_severity <- ifelse(dataset$fatals > 3, 1, 0)
# Logistic regression of high severity on EMS arrival time.
logistic_model <- glm(high_severity ~ ems_arrival_time, data = dataset, family = binomial)
summary(logistic_model)
# Exponentiated coefficients = odds ratios, for interpretability.
exp(coef(logistic_model))
# Narrative summary driven by the correlation test computed above.
cat("Summary of Findings:\n")
if (correlation_severity$p.value < 0.05) {
cat("- The analysis indicates a significant relationship between EMS arrival time and the severity of accidents.\n")
# The sign of the correlation estimate decides the wording.
if (correlation_severity$estimate > 0) {
cat("- Longer EMS response times are associated with more severe fatal outcomes.\n")
} else {
cat("- Shorter EMS response times are associated with less severe fatal outcomes.\n")
}
} else {
cat("- There is no significant relationship between EMS arrival time and the severity of accidents.\n")
}